summaryrefslogtreecommitdiff
path: root/usr.sbin
diff options
context:
space:
mode:
authorDave Voutila <dv@cvs.openbsd.org>2024-07-10 09:27:34 +0000
committerDave Voutila <dv@cvs.openbsd.org>2024-07-10 09:27:34 +0000
commit26f622afa9ed15c419509076cbab0468de61c4ae (patch)
treefc14924faa9194f88b49521a671264a34dd7934c /usr.sbin
parentb5018cbeea8ea496722a2466e247b93acb4c4e8f (diff)
Split vmd into mi/md parts.
Makes as much of the core of vmd mi, pushing x86-isms into separate compilation units. Adds build logic for arm64, but no emulation yet. (You can build vmd, but it won't have a vmm device to connect to.) Some more cleanup probably needed around interrupt controller abstraction, but that can come as we implement more than the i8259. ok mlarkin@
Diffstat (limited to 'usr.sbin')
-rw-r--r--usr.sbin/vmctl/Makefile4
-rw-r--r--usr.sbin/vmd/Makefile23
-rw-r--r--usr.sbin/vmd/i8253.c5
-rw-r--r--usr.sbin/vmd/mc146818.c5
-rw-r--r--usr.sbin/vmd/mmio.c1046
-rw-r--r--usr.sbin/vmd/ns8250.c7
-rw-r--r--usr.sbin/vmd/pci.c16
-rw-r--r--usr.sbin/vmd/pci.h11
-rw-r--r--usr.sbin/vmd/vioblk.c6
-rw-r--r--usr.sbin/vmd/virtio.c23
-rw-r--r--usr.sbin/vmd/virtio.h6
-rw-r--r--usr.sbin/vmd/vm.c1247
-rw-r--r--usr.sbin/vmd/vmd.c131
-rw-r--r--usr.sbin/vmd/vmd.h47
-rw-r--r--usr.sbin/vmd/vmm.c6
15 files changed, 151 insertions, 2432 deletions
diff --git a/usr.sbin/vmctl/Makefile b/usr.sbin/vmctl/Makefile
index 4dab88fba7b..05359639a3b 100644
--- a/usr.sbin/vmctl/Makefile
+++ b/usr.sbin/vmctl/Makefile
@@ -1,6 +1,6 @@
-# $OpenBSD: Makefile,v 1.6 2019/01/18 01:24:07 pd Exp $
+# $OpenBSD: Makefile,v 1.7 2024/07/10 09:27:33 dv Exp $
-.if ${MACHINE} == "amd64"
+.if ${MACHINE} == "amd64" || ${MACHINE} == "arm64"
.PATH: ${.CURDIR}/../vmd
diff --git a/usr.sbin/vmd/Makefile b/usr.sbin/vmd/Makefile
index 3fbb9d086b1..22c1e887823 100644
--- a/usr.sbin/vmd/Makefile
+++ b/usr.sbin/vmd/Makefile
@@ -1,13 +1,20 @@
-# $OpenBSD: Makefile,v 1.29 2023/04/27 22:47:27 dv Exp $
+# $OpenBSD: Makefile,v 1.30 2024/07/10 09:27:33 dv Exp $
-.if ${MACHINE} == "amd64"
+.if ${MACHINE} == "amd64" || ${MACHINE} == "arm64"
PROG= vmd
-SRCS= vmd.c control.c log.c priv.c proc.c config.c vmm.c
-SRCS+= vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c
-SRCS+= ns8250.c i8253.c dhcp.c packet.c mmio.c
-SRCS+= parse.y atomicio.c vioscsi.c vioraw.c vioqcow2.c fw_cfg.c
-SRCS+= vm_agentx.c vioblk.c vionet.c
+SRCS= vmd.c control.c log.c priv.c proc.c config.c vmm.c vm.c
+SRCS+= pci.c virtio.c dhcp.c packet.c parse.y atomicio.c
+SRCS+= vioscsi.c vioraw.c vioqcow2.c vm_agentx.c vioblk.c
+SRCS+= vionet.c
+
+.if ${MACHINE} == "amd64"
+SRCS+= i8253.c i8259.c fw_cfg.c loadfile_elf.c mc146818.c ns8250.c
+SRCS+= x86_vm.c x86_mmio.c
+.endif # amd64
+.if ${MACHINE} == "arm64"
+SRCS+= arm64_vm.c
+.endif # arm64
CFLAGS+= -Wall -I${.CURDIR}
CFLAGS+= -Wstrict-prototypes -Wmissing-prototypes
@@ -24,7 +31,7 @@ YFLAGS=
NOPROG= yes
-.endif
+.endif # amd64 or arm64
MAN= vmd.8 vm.conf.5
diff --git a/usr.sbin/vmd/i8253.c b/usr.sbin/vmd/i8253.c
index ac9855e38be..7cea3fa3869 100644
--- a/usr.sbin/vmd/i8253.c
+++ b/usr.sbin/vmd/i8253.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: i8253.c,v 1.40 2024/07/09 09:31:37 dv Exp $ */
+/* $OpenBSD: i8253.c,v 1.41 2024/07/10 09:27:33 dv Exp $ */
/*
* Copyright (c) 2016 Mike Larkin <mlarkin@openbsd.org>
*
@@ -29,7 +29,6 @@
#include "i8253.h"
#include "vmd.h"
-#include "vmm.h"
#include "atomicio.h"
extern char *__progname;
@@ -369,7 +368,7 @@ i8253_fire(int fd, short type, void *arg)
struct timeval tv;
struct i8253_channel *ctr = (struct i8253_channel *)arg;
- vcpu_assert_pic_irq(ctr->vm_id, 0, 0);
+ vcpu_assert_irq(ctr->vm_id, 0, 0);
if (ctr->mode != TIMER_INTTC) {
timerclear(&tv);
diff --git a/usr.sbin/vmd/mc146818.c b/usr.sbin/vmd/mc146818.c
index 660c625ebeb..62fc6459a8f 100644
--- a/usr.sbin/vmd/mc146818.c
+++ b/usr.sbin/vmd/mc146818.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: mc146818.c,v 1.28 2024/07/09 09:31:37 dv Exp $ */
+/* $OpenBSD: mc146818.c,v 1.29 2024/07/10 09:27:33 dv Exp $ */
/*
* Copyright (c) 2016 Mike Larkin <mlarkin@openbsd.org>
*
@@ -31,7 +31,6 @@
#include "mc146818.h"
#include "virtio.h"
#include "vmd.h"
-#include "vmm.h"
#define MC_RATE_MASK 0xf
@@ -148,7 +147,7 @@ rtc_fireper(int fd, short type, void *arg)
{
rtc.regs[MC_REGC] |= MC_REGC_PF;
- vcpu_assert_pic_irq((ptrdiff_t)arg, 0, 8);
+ vcpu_assert_irq((ptrdiff_t)arg, 0, 8);
evtimer_add(&rtc.per, &rtc.per_tv);
}
diff --git a/usr.sbin/vmd/mmio.c b/usr.sbin/vmd/mmio.c
index c5a189d5b85..e69de29bb2d 100644
--- a/usr.sbin/vmd/mmio.c
+++ b/usr.sbin/vmd/mmio.c
@@ -1,1046 +0,0 @@
-/* $OpenBSD: mmio.c,v 1.3 2024/02/10 12:31:16 dv Exp $ */
-
-/*
- * Copyright (c) 2022 Dave Voutila <dv@openbsd.org>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <errno.h>
-#include <string.h>
-
-#include <sys/types.h>
-#include <machine/specialreg.h>
-
-#include "vmd.h"
-#include "mmio.h"
-
-#define MMIO_DEBUG 0
-
-extern char* __progname;
-
-struct x86_decode_state {
- uint8_t s_bytes[15];
- size_t s_len;
- size_t s_idx;
-};
-
-enum decode_result {
- DECODE_ERROR = 0, /* Something went wrong. */
- DECODE_DONE, /* Decode success and no more work needed. */
- DECODE_MORE, /* Decode success and more work required. */
-};
-
-static const char *str_cpu_mode(int);
-static const char *str_decode_res(enum decode_result);
-static const char *str_opcode(struct x86_opcode *);
-static const char *str_operand_enc(struct x86_opcode *);
-static const char *str_reg(int);
-static const char *str_sreg(int);
-static int detect_cpu_mode(struct vcpu_reg_state *);
-
-static enum decode_result decode_prefix(struct x86_decode_state *,
- struct x86_insn *);
-static enum decode_result decode_opcode(struct x86_decode_state *,
- struct x86_insn *);
-static enum decode_result decode_modrm(struct x86_decode_state *,
- struct x86_insn *);
-static int get_modrm_reg(struct x86_insn *);
-static int get_modrm_addr(struct x86_insn *, struct vcpu_reg_state *vrs);
-static enum decode_result decode_disp(struct x86_decode_state *,
- struct x86_insn *);
-static enum decode_result decode_sib(struct x86_decode_state *,
- struct x86_insn *);
-static enum decode_result decode_imm(struct x86_decode_state *,
- struct x86_insn *);
-
-static enum decode_result peek_byte(struct x86_decode_state *, uint8_t *);
-static enum decode_result next_byte(struct x86_decode_state *, uint8_t *);
-static enum decode_result next_value(struct x86_decode_state *, size_t,
- uint64_t *);
-static int is_valid_state(struct x86_decode_state *, const char *);
-
-static int emulate_mov(struct x86_insn *, struct vm_exit *);
-static int emulate_movzx(struct x86_insn *, struct vm_exit *);
-
-/* Lookup table for 1-byte opcodes, in opcode alphabetical order. */
-const enum x86_opcode_type x86_1byte_opcode_tbl[255] = {
- /* MOV */
- [0x88] = OP_MOV,
- [0x89] = OP_MOV,
- [0x8A] = OP_MOV,
- [0x8B] = OP_MOV,
- [0x8C] = OP_MOV,
- [0xA0] = OP_MOV,
- [0xA1] = OP_MOV,
- [0xA2] = OP_MOV,
- [0xA3] = OP_MOV,
-
- /* MOVS */
- [0xA4] = OP_UNSUPPORTED,
- [0xA5] = OP_UNSUPPORTED,
-
- [ESCAPE] = OP_TWO_BYTE,
-};
-
-/* Lookup table for 1-byte operand encodings, in opcode alphabetical order. */
-const enum x86_operand_enc x86_1byte_operand_enc_tbl[255] = {
- /* MOV */
- [0x88] = OP_ENC_MR,
- [0x89] = OP_ENC_MR,
- [0x8A] = OP_ENC_RM,
- [0x8B] = OP_ENC_RM,
- [0x8C] = OP_ENC_MR,
- [0xA0] = OP_ENC_FD,
- [0xA1] = OP_ENC_FD,
- [0xA2] = OP_ENC_TD,
- [0xA3] = OP_ENC_TD,
-
- /* MOVS */
- [0xA4] = OP_ENC_ZO,
- [0xA5] = OP_ENC_ZO,
-};
-
-const enum x86_opcode_type x86_2byte_opcode_tbl[255] = {
- /* MOVZX */
- [0xB6] = OP_MOVZX,
- [0xB7] = OP_MOVZX,
-};
-
-const enum x86_operand_enc x86_2byte_operand_enc_table[255] = {
- /* MOVZX */
- [0xB6] = OP_ENC_RM,
- [0xB7] = OP_ENC_RM,
-};
-
-/*
- * peek_byte
- *
- * Fetch the next byte from the instruction bytes without advancing the
- * position in the stream.
- *
- * Return values:
- * DECODE_DONE: byte was found and is the last in the stream
- * DECODE_MORE: byte was found and there are more remaining to be read
- * DECODE_ERROR: state is invalid and no byte was found, *byte left unchanged
- */
-static enum decode_result
-peek_byte(struct x86_decode_state *state, uint8_t *byte)
-{
- enum decode_result res;
-
- if (state == NULL)
- return (DECODE_ERROR);
-
- if (state->s_idx == state->s_len)
- return (DECODE_ERROR);
-
- if (state->s_idx + 1 == state->s_len)
- res = DECODE_DONE;
- else
- res = DECODE_MORE;
-
- if (byte != NULL)
- *byte = state->s_bytes[state->s_idx];
- return (res);
-}
-
-/*
- * next_byte
- *
- * Fetch the next byte from the instruction bytes, advancing the position in the
- * stream and mutating decode state.
- *
- * Return values:
- * DECODE_DONE: byte was found and is the last in the stream
- * DECODE_MORE: byte was found and there are more remaining to be read
- * DECODE_ERROR: state is invalid and no byte was found, *byte left unchanged
- */
-static enum decode_result
-next_byte(struct x86_decode_state *state, uint8_t *byte)
-{
- uint8_t next;
-
- /* Cheat and see if we're going to fail. */
- if (peek_byte(state, &next) == DECODE_ERROR)
- return (DECODE_ERROR);
-
- if (byte != NULL)
- *byte = next;
- state->s_idx++;
-
- return (state->s_idx < state->s_len ? DECODE_MORE : DECODE_DONE);
-}
-
-/*
- * next_value
- *
- * Fetch the next `n' bytes from the instruction stream as a single uint64_t
- * value. The bytes are placed, in stream order, at the start of a zeroed
- * 8-byte buffer which is then copied into *value, so the result uses the
- * host's in-memory representation of a uint64_t.
- *
- * Parameters:
- *  state: decode state to consume bytes from
- *  n: number of bytes to consume, 1 through 8
- *  value: where to store the result; must not be NULL
- *
- * Return values:
- *  DECODE_DONE: value was read and the stream is now exhausted
- *  DECODE_MORE: value was read and more bytes remain
- *  DECODE_ERROR: value is NULL, n is out of range, or the stream ran out
- *                before `n' bytes were read; *value is left unchanged
- */
-static enum decode_result
-next_value(struct x86_decode_state *state, size_t n, uint64_t *value)
-{
-	uint8_t bytes[8];
-	size_t i;
-	enum decode_result res;
-
-	if (value == NULL)
-		return (DECODE_ERROR);
-
-	if (n == 0 || n > sizeof(bytes))
-		return (DECODE_ERROR);
-
-	memset(bytes, 0, sizeof(bytes));
-	for (i = 0; i < n; i++)
-		if ((res = next_byte(state, &bytes[i])) == DECODE_ERROR)
-			return (DECODE_ERROR);
-
-	/*
-	 * Copy rather than cast: reading a uint8_t array through a
-	 * uint64_t lvalue violates the aliasing rules and the array is
-	 * not guaranteed to be suitably aligned.
-	 */
-	memcpy(value, bytes, sizeof(*value));
-
-	return (res);
-}
-
-/*
- * is_valid_state
- *
- * Validate the decode state looks viable.
- *
- * Returns:
- * 1: if state is valid
- * 0: if an invariant is violated
- */
-static int
-is_valid_state(struct x86_decode_state *state, const char *fn_name)
-{
- const char *s = (fn_name != NULL) ? fn_name : __func__;
-
- if (state == NULL) {
- log_warnx("%s: null state", s);
- return (0);
- }
- if (state->s_len > sizeof(state->s_bytes)) {
- log_warnx("%s: invalid length", s);
- return (0);
- }
- if (state->s_idx + 1 > state->s_len) {
- log_warnx("%s: invalid index", s);
- return (0);
- }
-
- return (1);
-}
-
-#ifdef MMIO_DEBUG
-static void
-dump_regs(struct vcpu_reg_state *vrs)
-{
- size_t i;
- struct vcpu_segment_info *vsi;
-
- for (i = 0; i < VCPU_REGS_NGPRS; i++)
- log_info("%s: %s 0x%llx", __progname, str_reg(i),
- vrs->vrs_gprs[i]);
-
- for (i = 0; i < VCPU_REGS_NSREGS; i++) {
- vsi = &vrs->vrs_sregs[i];
- log_info("%s: %s { sel: 0x%04x, lim: 0x%08x, ar: 0x%08x, "
- "base: 0x%llx }", __progname, str_sreg(i),
- vsi->vsi_sel, vsi->vsi_limit, vsi->vsi_ar, vsi->vsi_base);
- }
-}
-
-static void
-dump_insn(struct x86_insn *insn)
-{
- log_info("instruction { %s, enc=%s, len=%d, mod=0x%02x, ("
- "reg=%s, addr=0x%lx) sib=0x%02x }",
- str_opcode(&insn->insn_opcode),
- str_operand_enc(&insn->insn_opcode), insn->insn_bytes_len,
- insn->insn_modrm, str_reg(insn->insn_reg),
- insn->insn_gva, insn->insn_sib);
-}
-#endif /* MMIO_DEBUG */
-
-/*
- * str_cpu_mode
- *
- * Map a VMM_CPU_MODE_* constant to a short printable name for log output.
- * Any value without a case below yields "UNKNOWN", matching the other
- * str_* helpers in this file.
- */
-static const char *
-str_cpu_mode(int mode)
-{
-	switch (mode) {
-	case VMM_CPU_MODE_REAL: return "REAL";
-	case VMM_CPU_MODE_PROT: return "PROT";
-	case VMM_CPU_MODE_PROT32: return "PROT32";
-	case VMM_CPU_MODE_COMPAT: return "COMPAT";
-	case VMM_CPU_MODE_LONG: return "LONG";
-	default: return "UNKNOWN";
-	}
-}
-
-__unused static const char *
-str_decode_res(enum decode_result res) {
- switch (res) {
- case DECODE_DONE: return "DONE";
- case DECODE_MORE: return "MORE";
- case DECODE_ERROR: return "ERROR";
- default: return "UNKNOWN";
- }
-}
-
-static const char *
-str_opcode(struct x86_opcode *opcode)
-{
- switch (opcode->op_type) {
- case OP_IN: return "IN";
- case OP_INS: return "INS";
- case OP_MOV: return "MOV";
- case OP_MOVZX: return "MOVZX";
- case OP_OUT: return "OUT";
- case OP_OUTS: return "OUTS";
- case OP_UNSUPPORTED: return "UNSUPPORTED";
- default: return "UNKNOWN";
- }
-}
-
-static const char *
-str_operand_enc(struct x86_opcode *opcode)
-{
- switch (opcode->op_encoding) {
- case OP_ENC_I: return "I";
- case OP_ENC_MI: return "MI";
- case OP_ENC_MR: return "MR";
- case OP_ENC_RM: return "RM";
- case OP_ENC_FD: return "FD";
- case OP_ENC_TD: return "TD";
- case OP_ENC_OI: return "OI";
- case OP_ENC_ZO: return "ZO";
- default: return "UNKNOWN";
- }
-}
-
-static const char *
-str_reg(int reg) {
- switch (reg) {
- case VCPU_REGS_RAX: return "RAX";
- case VCPU_REGS_RCX: return "RCX";
- case VCPU_REGS_RDX: return "RDX";
- case VCPU_REGS_RBX: return "RBX";
- case VCPU_REGS_RSI: return "RSI";
- case VCPU_REGS_RDI: return "RDI";
- case VCPU_REGS_R8: return " R8";
- case VCPU_REGS_R9: return " R9";
- case VCPU_REGS_R10: return "R10";
- case VCPU_REGS_R11: return "R11";
- case VCPU_REGS_R12: return "R12";
- case VCPU_REGS_R13: return "R13";
- case VCPU_REGS_R14: return "R14";
- case VCPU_REGS_R15: return "R15";
- case VCPU_REGS_RSP: return "RSP";
- case VCPU_REGS_RBP: return "RBP";
- case VCPU_REGS_RIP: return "RIP";
- case VCPU_REGS_RFLAGS: return "RFLAGS";
- default: return "UNKNOWN";
- }
-}
-
-/*
- * str_sreg
- *
- * Map a VCPU_REGS_* segment register index to its printable name for log
- * output. Any value without a case below yields "UNKNOWN".
- */
-static const char *
-str_sreg(int sreg) {
-	switch (sreg) {
-	case VCPU_REGS_CS: return "CS";
-	case VCPU_REGS_DS: return "DS";
-	case VCPU_REGS_ES: return "ES";
-	case VCPU_REGS_FS: return "FS";
-	case VCPU_REGS_GS: return "GS";
-	/* SS was previously mislabelled as "GS". */
-	case VCPU_REGS_SS: return "SS";
-	case VCPU_REGS_LDTR: return "LDTR";
-	case VCPU_REGS_TR: return "TR";
-	default: return "UNKNOWN";
-	}
-}
-
-static int
-detect_cpu_mode(struct vcpu_reg_state *vrs)
-{
- uint64_t cr0, cr4, cs, efer, rflags;
-
- /* Is protected mode enabled? */
- cr0 = vrs->vrs_crs[VCPU_REGS_CR0];
- if (!(cr0 & CR0_PE))
- return (VMM_CPU_MODE_REAL);
-
- cr4 = vrs->vrs_crs[VCPU_REGS_CR4];
- cs = vrs->vrs_sregs[VCPU_REGS_CS].vsi_ar;
- efer = vrs->vrs_msrs[VCPU_REGS_EFER];
- rflags = vrs->vrs_gprs[VCPU_REGS_RFLAGS];
-
- /* Check for Long modes. */
- if ((efer & EFER_LME) && (cr4 & CR4_PAE) && (cr0 & CR0_PG)) {
- if (cs & CS_L) {
- /* Long Modes */
- if (!(cs & CS_D))
- return (VMM_CPU_MODE_LONG);
- log_warnx("%s: invalid cpu mode", __progname);
- return (VMM_CPU_MODE_UNKNOWN);
- } else {
- /* Compatibility Modes */
- if (cs & CS_D) /* XXX Add Compat32 mode */
- return (VMM_CPU_MODE_UNKNOWN);
- return (VMM_CPU_MODE_COMPAT);
- }
- }
-
- /* Check for 32-bit Protected Mode. */
- if (cs & CS_D)
- return (VMM_CPU_MODE_PROT32);
-
- /* Check for virtual 8086 mode. */
- if (rflags & EFLAGS_VM) {
- /* XXX add Virtual8086 mode */
- log_warnx("%s: Virtual 8086 mode", __progname);
- return (VMM_CPU_MODE_UNKNOWN);
- }
-
- /* Can't determine mode. */
- log_warnx("%s: invalid cpu mode", __progname);
- return (VMM_CPU_MODE_UNKNOWN);
-}
-
-/*
- * decode_prefix
- *
- * Consume any legacy or REX prefix bytes at the current stream position and
- * record them in insn->insn_prefix. The stream is left positioned at the
- * first byte that is not a recognized prefix.
- *
- * Return values:
- *  DECODE_MORE: a non-prefix byte was found; decoding should continue
- *  DECODE_ERROR: invalid state or insn, a REX byte outside of long mode, a
- *                VEX prefix, or the stream ended before a non-prefix byte
- */
-static enum decode_result
-decode_prefix(struct x86_decode_state *state, struct x86_insn *insn)
-{
-	enum decode_result res = DECODE_ERROR;
-	struct x86_prefix *prefix;
-	uint8_t byte;
-
-	/*
-	 * Report failure with DECODE_ERROR, not -1: callers compare the
-	 * result against DECODE_ERROR (which is 0) and would otherwise
-	 * carry on decoding with an invalid state.
-	 */
-	if (!is_valid_state(state, __func__) || insn == NULL)
-		return (DECODE_ERROR);
-
-	prefix = &insn->insn_prefix;
-	memset(prefix, 0, sizeof(*prefix));
-
-	/*
-	 * Decode prefixes. The last of its kind wins. The behavior is undefined
-	 * in the Intel SDM (see Vol 2, 2.1.1 Instruction Prefixes.)
-	 */
-	while ((res = peek_byte(state, &byte)) != DECODE_ERROR) {
-		switch (byte) {
-		case LEG_1_LOCK:
-		case LEG_1_REPNE:
-		case LEG_1_REP:
-			prefix->pfx_group1 = byte;
-			break;
-		case LEG_2_CS:
-		case LEG_2_SS:
-		case LEG_2_DS:
-		case LEG_2_ES:
-		case LEG_2_FS:
-		case LEG_2_GS:
-			prefix->pfx_group2 = byte;
-			break;
-		case LEG_3_OPSZ:
-			prefix->pfx_group3 = byte;
-			break;
-		case LEG_4_ADDRSZ:
-			prefix->pfx_group4 = byte;
-			break;
-		case REX_BASE...REX_BASE + 0x0F:
-			if (insn->insn_cpu_mode == VMM_CPU_MODE_LONG)
-				prefix->pfx_rex = byte;
-			else /* INC encountered */
-				return (DECODE_ERROR);
-			break;
-		case VEX_2_BYTE:
-		case VEX_3_BYTE:
-			log_warnx("%s: VEX not supported", __func__);
-			return (DECODE_ERROR);
-		default:
-			/* Something other than a valid prefix. */
-			return (DECODE_MORE);
-		}
-		/* Advance our position. */
-		next_byte(state, NULL);
-	}
-
-	/* Only reached when the stream ran out: res is DECODE_ERROR. */
-	return (res);
-}
-
-static enum decode_result
-decode_modrm(struct x86_decode_state *state, struct x86_insn *insn)
-{
- enum decode_result res;
- uint8_t byte = 0;
-
- if (!is_valid_state(state, __func__) || insn == NULL)
- return (DECODE_ERROR);
-
- insn->insn_modrm_valid = 0;
-
- /* Check the operand encoding to see if we fetch a byte or abort. */
- switch (insn->insn_opcode.op_encoding) {
- case OP_ENC_MR:
- case OP_ENC_RM:
- case OP_ENC_MI:
- res = next_byte(state, &byte);
- if (res == DECODE_ERROR) {
- log_warnx("%s: failed to get modrm byte", __func__);
- break;
- }
- insn->insn_modrm = byte;
- insn->insn_modrm_valid = 1;
- break;
-
- case OP_ENC_I:
- case OP_ENC_OI:
- log_warnx("%s: instruction does not need memory assist",
- __func__);
- res = DECODE_ERROR;
- break;
-
- default:
- /* Peek to see if we're done decode. */
- res = peek_byte(state, NULL);
- }
-
- return (res);
-}
-
-static int
-get_modrm_reg(struct x86_insn *insn)
-{
- if (insn == NULL)
- return (-1);
-
- if (insn->insn_modrm_valid) {
- switch (MODRM_REGOP(insn->insn_modrm)) {
- case 0:
- insn->insn_reg = VCPU_REGS_RAX;
- break;
- case 1:
- insn->insn_reg = VCPU_REGS_RCX;
- break;
- case 2:
- insn->insn_reg = VCPU_REGS_RDX;
- break;
- case 3:
- insn->insn_reg = VCPU_REGS_RBX;
- break;
- case 4:
- insn->insn_reg = VCPU_REGS_RSP;
- break;
- case 5:
- insn->insn_reg = VCPU_REGS_RBP;
- break;
- case 6:
- insn->insn_reg = VCPU_REGS_RSI;
- break;
- case 7:
- insn->insn_reg = VCPU_REGS_RDI;
- break;
- }
- }
-
- /* REX R bit selects extended registers in LONG mode. */
- if (insn->insn_prefix.pfx_rex & REX_R)
- insn->insn_reg += 8;
-
- return (0);
-}
-
-static int
-get_modrm_addr(struct x86_insn *insn, struct vcpu_reg_state *vrs)
-{
- uint8_t mod, rm;
- vaddr_t addr = 0x0UL;
-
- if (insn == NULL || vrs == NULL)
- return (-1);
-
- if (insn->insn_modrm_valid) {
- rm = MODRM_RM(insn->insn_modrm);
- mod = MODRM_MOD(insn->insn_modrm);
-
- switch (rm) {
- case 0b000:
- addr = vrs->vrs_gprs[VCPU_REGS_RAX];
- break;
- case 0b001:
- addr = vrs->vrs_gprs[VCPU_REGS_RCX];
- break;
- case 0b010:
- addr = vrs->vrs_gprs[VCPU_REGS_RDX];
- break;
- case 0b011:
- addr = vrs->vrs_gprs[VCPU_REGS_RBX];
- break;
- case 0b100:
- if (mod == 0b11)
- addr = vrs->vrs_gprs[VCPU_REGS_RSP];
- break;
- case 0b101:
- if (mod != 0b00)
- addr = vrs->vrs_gprs[VCPU_REGS_RBP];
- break;
- case 0b110:
- addr = vrs->vrs_gprs[VCPU_REGS_RSI];
- break;
- case 0b111:
- addr = vrs->vrs_gprs[VCPU_REGS_RDI];
- break;
- }
-
- insn->insn_gva = addr;
- }
-
- return (0);
-}
-
-static enum decode_result
-decode_disp(struct x86_decode_state *state, struct x86_insn *insn)
-{
- enum decode_result res = DECODE_ERROR;
- uint64_t disp = 0;
-
- if (!is_valid_state(state, __func__) || insn == NULL)
- return (DECODE_ERROR);
-
- if (!insn->insn_modrm_valid)
- return (DECODE_ERROR);
-
- switch (MODRM_MOD(insn->insn_modrm)) {
- case 0x00:
- insn->insn_disp_type = DISP_0;
- res = DECODE_MORE;
- break;
- case 0x01:
- insn->insn_disp_type = DISP_1;
- res = next_value(state, 1, &disp);
- if (res == DECODE_ERROR)
- return (res);
- insn->insn_disp = disp;
- break;
- case 0x02:
- if (insn->insn_prefix.pfx_group4 == LEG_4_ADDRSZ) {
- insn->insn_disp_type = DISP_2;
- res = next_value(state, 2, &disp);
- } else {
- insn->insn_disp_type = DISP_4;
- res = next_value(state, 4, &disp);
- }
- if (res == DECODE_ERROR)
- return (res);
- insn->insn_disp = disp;
- break;
- default:
- insn->insn_disp_type = DISP_NONE;
- res = DECODE_MORE;
- }
-
- return (res);
-}
-
-/*
- * decode_opcode
- *
- * Consume the one- or two-byte opcode at the current stream position, look
- * up its type and operand encoding in the opcode tables and record the
- * result in insn->insn_opcode.
- *
- * Return values:
- *  DECODE_DONE: opcode decoded and the stream is exhausted
- *  DECODE_MORE: opcode decoded and more bytes remain
- *  DECODE_ERROR: invalid state or insn, the stream ran out, or the opcode
- *                is unknown, unsupported or has no known operand encoding
- */
-static enum decode_result
-decode_opcode(struct x86_decode_state *state, struct x86_insn *insn)
-{
-	enum decode_result res;
-	enum x86_opcode_type type;
-	enum x86_operand_enc enc;
-	struct x86_opcode *opcode;
-	uint8_t byte, byte2;
-
-	/*
-	 * Report failure with DECODE_ERROR, not -1: callers compare the
-	 * result against DECODE_ERROR (which is 0).
-	 */
-	if (!is_valid_state(state, __func__) || insn == NULL)
-		return (DECODE_ERROR);
-
-	/* Only touch insn once it is known to be non-NULL. */
-	opcode = &insn->insn_opcode;
-	memset(opcode, 0, sizeof(*opcode));
-
-	res = next_byte(state, &byte);
-	if (res == DECODE_ERROR)
-		return (res);
-
-	/*
-	 * The lookup tables are smaller than the range of a uint8_t, so an
-	 * opcode byte of 0xFF would index one past the end. Treat anything
-	 * the tables cannot describe as unsupported.
-	 */
-	if (byte >= sizeof(x86_1byte_opcode_tbl) /
-	    sizeof(x86_1byte_opcode_tbl[0]) ||
-	    byte >= sizeof(x86_1byte_operand_enc_tbl) /
-	    sizeof(x86_1byte_operand_enc_tbl[0])) {
-		log_warnx("%s: unsupported opcode", __func__);
-		return (DECODE_ERROR);
-	}
-
-	type = x86_1byte_opcode_tbl[byte];
-	switch(type) {
-	case OP_UNKNOWN:
-	case OP_UNSUPPORTED:
-		log_warnx("%s: unsupported opcode", __func__);
-		return (DECODE_ERROR);
-
-	case OP_TWO_BYTE:
-		res = next_byte(state, &byte2);
-		if (res == DECODE_ERROR)
-			return (res);
-
-		/* Same bounds concern as for the 1-byte tables. */
-		if (byte2 >= sizeof(x86_2byte_opcode_tbl) /
-		    sizeof(x86_2byte_opcode_tbl[0]) ||
-		    byte2 >= sizeof(x86_2byte_operand_enc_table) /
-		    sizeof(x86_2byte_operand_enc_table[0])) {
-			log_warnx("%s: unsupported 2-byte opcode", __func__);
-			return (DECODE_ERROR);
-		}
-
-		type = x86_2byte_opcode_tbl[byte2];
-		if (type == OP_UNKNOWN || type == OP_UNSUPPORTED) {
-			log_warnx("%s: unsupported 2-byte opcode", __func__);
-			return (DECODE_ERROR);
-		}
-
-		opcode->op_bytes[0] = byte;
-		opcode->op_bytes[1] = byte2;
-		opcode->op_bytes_len = 2;
-		enc = x86_2byte_operand_enc_table[byte2];
-		break;
-
-	default:
-		/* We've potentially got a known 1-byte opcode. */
-		opcode->op_bytes[0] = byte;
-		opcode->op_bytes_len = 1;
-		enc = x86_1byte_operand_enc_tbl[byte];
-	}
-
-	if (enc == OP_ENC_UNKNOWN)
-		return (DECODE_ERROR);
-
-	opcode->op_type = type;
-	opcode->op_encoding = enc;
-
-	return (res);
-}
-
-/*
- * decode_sib
- *
- * Consume the optional SIB byte. A SIB byte is read only when a ModR/M byte
- * was decoded and its r/m field is 0b100; otherwise nothing is consumed.
- *
- * Return values:
- *  DECODE_DONE: SIB byte read and the stream is exhausted
- *  DECODE_MORE: no SIB byte was needed, or one was read and bytes remain
- *  DECODE_ERROR: invalid state or insn, or the stream ran out
- */
-static enum decode_result
-decode_sib(struct x86_decode_state *state, struct x86_insn *insn)
-{
-	enum decode_result res;
-	uint8_t byte;
-
-	/*
-	 * Report failure with DECODE_ERROR, not -1: callers compare the
-	 * result against DECODE_ERROR (which is 0).
-	 */
-	if (!is_valid_state(state, __func__) || insn == NULL)
-		return (DECODE_ERROR);
-
-	/* SIB is optional, so assume we will be continuing. */
-	res = DECODE_MORE;
-
-	insn->insn_sib_valid = 0;
-	if (!insn->insn_modrm_valid)
-		return (res);
-
-	/* XXX is SIB valid in all cpu modes? */
-	if (MODRM_RM(insn->insn_modrm) == 0b100) {
-		res = next_byte(state, &byte);
-		if (res != DECODE_ERROR) {
-			insn->insn_sib_valid = 1;
-			insn->insn_sib = byte;
-		}
-	}
-
-	return (res);
-}
-
-static enum decode_result
-decode_imm(struct x86_decode_state *state, struct x86_insn *insn)
-{
- enum decode_result res;
- size_t num_bytes;
- uint64_t value;
-
- if (!is_valid_state(state, __func__) || insn == NULL)
- return (DECODE_ERROR);
-
- /* Only handle MI encoded instructions. Others shouldn't need assist. */
- if (insn->insn_opcode.op_encoding != OP_ENC_MI)
- return (DECODE_DONE);
-
- /* Exceptions related to MOV instructions. */
- if (insn->insn_opcode.op_type == OP_MOV) {
- switch (insn->insn_opcode.op_bytes[0]) {
- case 0xC6:
- num_bytes = 1;
- break;
- case 0xC7:
- if (insn->insn_cpu_mode == VMM_CPU_MODE_REAL)
- num_bytes = 2;
- else
- num_bytes = 4;
- break;
- default:
- log_warnx("%s: cannot decode immediate bytes for MOV",
- __func__);
- return (DECODE_ERROR);
- }
- } else {
- /* Fallback to interpreting based on cpu mode and REX. */
- if (insn->insn_cpu_mode == VMM_CPU_MODE_REAL)
- num_bytes = 2;
- else if (insn->insn_prefix.pfx_rex == REX_NONE)
- num_bytes = 4;
- else
- num_bytes = 8;
- }
-
- res = next_value(state, num_bytes, &value);
- if (res != DECODE_ERROR) {
- insn->insn_immediate = value;
- insn->insn_immediate_len = num_bytes;
- }
-
- return (res);
-}
-
-
-/*
- * insn_decode
- *
- * Decode an x86 instruction from the instruction bytes carried in the vm
- * exit (exit->vee.vee_insn_bytes / vee_insn_len), using the register state
- * in exit->vrs to detect the cpu mode. On success *insn describes the
- * decoded instruction and insn->insn_bytes_len holds the number of bytes
- * consumed.
- *
- * Parameters:
- *  exit: vm exit information supplying instruction bytes and registers
- *  insn: output instruction; zeroed before decoding starts
- *
- * Return values:
- *  0: successful decode
- *  Non-zero: an exception occurred during decode
- */
-int
-insn_decode(struct vm_exit *exit, struct x86_insn *insn)
-{
-	enum decode_result res;
-	struct vcpu_reg_state *vrs;
-	struct x86_decode_state state;
-	uint8_t *bytes, len;
-	int mode;
-
-	/*
-	 * Fail with -1 here. DECODE_ERROR is 0, which this function's
-	 * contract defines as success.
-	 */
-	if (exit == NULL || insn == NULL) {
-		log_warnx("%s: invalid input", __func__);
-		return (-1);
-	}
-
-	/* Only form pointers into exit once it is known to be non-NULL. */
-	vrs = &exit->vrs;
-	bytes = exit->vee.vee_insn_bytes;
-	len = exit->vee.vee_insn_len;
-
-	/* 0. Initialize state and instruction objects. */
-	memset(insn, 0, sizeof(*insn));
-	memset(&state, 0, sizeof(state));
-
-	/* Never copy more than the decode buffer can hold. */
-	if (len == 0 || len > sizeof(state.s_bytes)) {
-		log_warnx("%s: invalid instruction length %u", __func__, len);
-		return (-1);
-	}
-	state.s_len = len;
-	memcpy(state.s_bytes, bytes, len);
-
-	/* 1. Detect CPU mode. */
-	mode = detect_cpu_mode(vrs);
-	if (mode == VMM_CPU_MODE_UNKNOWN) {
-		log_warnx("%s: failed to identify cpu mode", __func__);
-#ifdef MMIO_DEBUG
-		dump_regs(vrs);
-#endif
-		return (-1);
-	}
-	insn->insn_cpu_mode = mode;
-
-#ifdef MMIO_DEBUG
-	log_info("%s: cpu mode %s detected", __progname, str_cpu_mode(mode));
-	printf("%s: got bytes: [ ", __progname);
-	for (int i = 0; i < len; i++) {
-		printf("%02x ", bytes[i]);
-	}
-	printf("]\n");
-#endif
-	/* 2. Decode prefixes. */
-	res = decode_prefix(&state, insn);
-	if (res == DECODE_ERROR) {
-		log_warnx("%s: error decoding prefixes", __func__);
-		goto err;
-	} else if (res == DECODE_DONE)
-		goto done;
-
-#ifdef MMIO_DEBUG
-	log_info("%s: prefixes {g1: 0x%02x, g2: 0x%02x, g3: 0x%02x, g4: 0x%02x,"
-	    " rex: 0x%02x }", __progname, insn->insn_prefix.pfx_group1,
-	    insn->insn_prefix.pfx_group2, insn->insn_prefix.pfx_group3,
-	    insn->insn_prefix.pfx_group4, insn->insn_prefix.pfx_rex);
-#endif
-
-	/* 3. Pick apart opcode. Here we can start short-circuiting. */
-	res = decode_opcode(&state, insn);
-	if (res == DECODE_ERROR) {
-		log_warnx("%s: error decoding opcode", __func__);
-		goto err;
-	} else if (res == DECODE_DONE)
-		goto done;
-
-#ifdef MMIO_DEBUG
-	log_info("%s: found opcode %s (operand encoding %s) (%s)", __progname,
-	    str_opcode(&insn->insn_opcode), str_operand_enc(&insn->insn_opcode),
-	    str_decode_res(res));
-#endif
-
-	/* Process optional ModR/M byte. */
-	res = decode_modrm(&state, insn);
-	if (res == DECODE_ERROR) {
-		log_warnx("%s: error decoding modrm", __func__);
-		goto err;
-	}
-	if (get_modrm_addr(insn, vrs) != 0)
-		goto err;
-	if (get_modrm_reg(insn) != 0)
-		goto err;
-	if (res == DECODE_DONE)
-		goto done;
-
-#ifdef MMIO_DEBUG
-	if (insn->insn_modrm_valid)
-		log_info("%s: found ModRM 0x%02x (%s)", __progname,
-		    insn->insn_modrm, str_decode_res(res));
-#endif
-
-	/* Process optional SIB byte. */
-	res = decode_sib(&state, insn);
-	if (res == DECODE_ERROR) {
-		log_warnx("%s: error decoding sib", __func__);
-		goto err;
-	} else if (res == DECODE_DONE)
-		goto done;
-
-#ifdef MMIO_DEBUG
-	if (insn->insn_sib_valid)
-		log_info("%s: found SIB 0x%02x (%s)", __progname,
-		    insn->insn_sib, str_decode_res(res));
-#endif
-
-	/* Process any Displacement bytes. */
-	res = decode_disp(&state, insn);
-	if (res == DECODE_ERROR) {
-		log_warnx("%s: error decoding displacement", __func__);
-		goto err;
-	} else if (res == DECODE_DONE)
-		goto done;
-
-	/* Process any Immediate data bytes. */
-	res = decode_imm(&state, insn);
-	if (res == DECODE_ERROR) {
-		log_warnx("%s: error decoding immediate bytes", __func__);
-		goto err;
-	}
-
-done:
-	/* Number of instruction bytes consumed by the decoder. */
-	insn->insn_bytes_len = state.s_idx;
-
-#ifdef MMIO_DEBUG
-	log_info("%s: final instruction length is %u", __func__,
-	    insn->insn_bytes_len);
-	dump_insn(insn);
-	log_info("%s: modrm: {mod: %d, regop: %d, rm: %d}", __func__,
-	    MODRM_MOD(insn->insn_modrm), MODRM_REGOP(insn->insn_modrm),
-	    MODRM_RM(insn->insn_modrm));
-	dump_regs(vrs);
-#endif /* MMIO_DEBUG */
-	return (0);
-
-err:
-#ifdef MMIO_DEBUG
-	dump_insn(insn);
-	log_info("%s: modrm: {mod: %d, regop: %d, rm: %d}", __func__,
-	    MODRM_MOD(insn->insn_modrm), MODRM_REGOP(insn->insn_modrm),
-	    MODRM_RM(insn->insn_modrm));
-	dump_regs(vrs);
-#endif /* MMIO_DEBUG */
-	return (-1);
-}
-
-static int
-emulate_mov(struct x86_insn *insn, struct vm_exit *exit)
-{
- /* XXX Only supports read to register for now */
- if (insn->insn_opcode.op_encoding != OP_ENC_RM)
- return (-1);
-
- /* XXX No device emulation yet. Fill with 0xFFs. */
- exit->vrs.vrs_gprs[insn->insn_reg] = 0xFFFFFFFFFFFFFFFF;
-
- return (0);
-}
-
-static int
-emulate_movzx(struct x86_insn *insn, struct vm_exit *exit)
-{
- uint8_t byte, len, src = 1, dst = 2;
- uint64_t value = 0;
-
- /* Only RM is valid for MOVZX. */
- if (insn->insn_opcode.op_encoding != OP_ENC_RM) {
- log_warnx("invalid op encoding for MOVZX: %d",
- insn->insn_opcode.op_encoding);
- return (-1);
- }
-
- len = insn->insn_opcode.op_bytes_len;
- if (len < 1 || len > sizeof(insn->insn_opcode.op_bytes)) {
- log_warnx("invalid opcode byte length: %d", len);
- return (-1);
- }
-
- byte = insn->insn_opcode.op_bytes[len - 1];
- switch (byte) {
- case 0xB6:
- src = 1;
- if (insn->insn_cpu_mode == VMM_CPU_MODE_PROT
- || insn->insn_cpu_mode == VMM_CPU_MODE_REAL)
- dst = 2;
- else if (insn->insn_prefix.pfx_rex == REX_NONE)
- dst = 4;
- else // XXX validate CPU mode
- dst = 8;
- break;
- case 0xB7:
- src = 2;
- if (insn->insn_prefix.pfx_rex == REX_NONE)
- dst = 4;
- else // XXX validate CPU mode
- dst = 8;
- break;
- default:
- log_warnx("invalid byte in MOVZX opcode: %x", byte);
- return (-1);
- }
-
- if (dst == 4)
- exit->vrs.vrs_gprs[insn->insn_reg] &= 0xFFFFFFFF00000000;
- else
- exit->vrs.vrs_gprs[insn->insn_reg] = 0x0UL;
-
- /* XXX No device emulation yet. Fill with 0xFFs. */
- switch (src) {
- case 1: value = 0xFF; break;
- case 2: value = 0xFFFF; break;
- case 4: value = 0xFFFFFFFF; break;
- case 8: value = 0xFFFFFFFFFFFFFFFF; break;
- default:
- log_warnx("invalid source size: %d", src);
- return (-1);
- }
-
- exit->vrs.vrs_gprs[insn->insn_reg] |= value;
-
- return (0);
-}
-
-/*
- * insn_emulate
- *
- * Returns:
- * 0: success
- * EINVAL: exception occurred
- * EFAULT: page fault occurred, requires retry
- * ENOTSUP: an unsupported instruction was provided
- */
-int
-insn_emulate(struct vm_exit *exit, struct x86_insn *insn)
-{
- int res;
-
- switch (insn->insn_opcode.op_type) {
- case OP_MOV:
- res = emulate_mov(insn, exit);
- break;
-
- case OP_MOVZX:
- res = emulate_movzx(insn, exit);
- break;
-
- default:
- log_warnx("%s: emulation not defined for %s", __func__,
- str_opcode(&insn->insn_opcode));
- res = ENOTSUP;
- }
-
- if (res == 0)
- exit->vrs.vrs_gprs[VCPU_REGS_RIP] += insn->insn_bytes_len;
-
- return (res);
-}
diff --git a/usr.sbin/vmd/ns8250.c b/usr.sbin/vmd/ns8250.c
index bcb48ef95d8..17cc8bfe525 100644
--- a/usr.sbin/vmd/ns8250.c
+++ b/usr.sbin/vmd/ns8250.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ns8250.c,v 1.39 2024/07/09 09:31:37 dv Exp $ */
+/* $OpenBSD: ns8250.c,v 1.40 2024/07/10 09:27:33 dv Exp $ */
/*
* Copyright (c) 2016 Mike Larkin <mlarkin@openbsd.org>
*
@@ -30,7 +30,6 @@
#include "atomicio.h"
#include "ns8250.h"
#include "vmd.h"
-#include "vmm.h"
extern char *__progname;
struct ns8250_dev com1_dev;
@@ -80,7 +79,7 @@ ratelimit(int fd, short type, void *arg)
com1_dev.regs.iir |= IIR_TXRDY;
com1_dev.regs.iir &= ~IIR_NOPEND;
- vcpu_assert_pic_irq(com1_dev.vmid, 0, com1_dev.irq);
+ vcpu_assert_irq(com1_dev.vmid, 0, com1_dev.irq);
mutex_unlock(&com1_dev.mutex);
}
@@ -157,7 +156,7 @@ com_rcv_event(int fd, short kind, void *arg)
/* If pending interrupt, inject */
if ((com1_dev.regs.iir & IIR_NOPEND) == 0) {
/* XXX: vcpu_id */
- vcpu_assert_pic_irq((uintptr_t)arg, 0, com1_dev.irq);
+ vcpu_assert_irq((uintptr_t)arg, 0, com1_dev.irq);
}
mutex_unlock(&com1_dev.mutex);
diff --git a/usr.sbin/vmd/pci.c b/usr.sbin/vmd/pci.c
index 1722baa9ea1..0dbe846fd01 100644
--- a/usr.sbin/vmd/pci.c
+++ b/usr.sbin/vmd/pci.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: pci.c,v 1.32 2024/07/09 09:31:37 dv Exp $ */
+/* $OpenBSD: pci.c,v 1.33 2024/07/10 09:27:33 dv Exp $ */
/*
* Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
@@ -28,12 +28,12 @@
#include "vmd.h"
#include "pci.h"
-#include "vmm.h"
#include "i8259.h"
#include "atomicio.h"
struct pci pci;
+extern struct vmd_vm *current_vm;
extern char *__progname;
/* PIC IRQs, assigned to devices in order */
@@ -86,7 +86,9 @@ pci_add_bar(uint8_t id, uint32_t type, void *barfn, void *cookie)
pci.pci_devices[id].pd_bartype[bar_ct] = PCI_BAR_TYPE_MMIO;
pci.pci_devices[id].pd_barsize[bar_ct] = VM_PCI_MMIO_BAR_SIZE;
pci.pci_devices[id].pd_bar_ct++;
- } else if (type == PCI_MAPREG_TYPE_IO) {
+ }
+#ifdef __amd64__
+ else if (type == PCI_MAPREG_TYPE_IO) {
if (pci.pci_next_io_bar >= VM_PCI_IO_BAR_END)
return (1);
@@ -102,6 +104,7 @@ pci_add_bar(uint8_t id, uint32_t type, void *barfn, void *cookie)
pci.pci_devices[id].pd_barsize[bar_ct] = VM_PCI_IO_BAR_SIZE;
pci.pci_devices[id].pd_bar_ct++;
}
+#endif /* __amd64__ */
return (0);
}
@@ -195,7 +198,7 @@ pci_add_device(uint8_t *id, uint16_t vid, uint16_t pid, uint8_t class,
pci.pci_next_pic_irq++;
DPRINTF("assigned irq %d to pci dev %d",
pci.pci_devices[*id].pd_irq, *id);
- pic_set_elcr(pci.pci_devices[*id].pd_irq, 1);
+ intr_toggle_el(current_vm, pci.pci_devices[*id].pd_irq, 1);
}
pci.pci_dev_ct ++;
@@ -216,7 +219,10 @@ pci_init(void)
memset(&pci, 0, sizeof(pci));
pci.pci_next_mmio_bar = VMM_PCI_MMIO_BAR_BASE;
+
+#ifdef __amd64__
pci.pci_next_io_bar = VM_PCI_IO_BAR_BASE;
+#endif /* __amd64__ */
if (pci_add_device(&id, PCI_VENDOR_OPENBSD, PCI_PRODUCT_OPENBSD_PCHB,
PCI_CLASS_BRIDGE, PCI_SUBCLASS_BRIDGE_HOST,
@@ -226,6 +232,7 @@ pci_init(void)
}
}
+#ifdef __amd64__
void
pci_handle_address_reg(struct vm_run_params *vrp)
{
@@ -415,6 +422,7 @@ pci_handle_data_reg(struct vm_run_params *vrp)
}
}
}
+#endif /* __amd64__ */
int
pci_dump(int fd)
diff --git a/usr.sbin/vmd/pci.h b/usr.sbin/vmd/pci.h
index 73b54437bed..0b05a9298d1 100644
--- a/usr.sbin/vmd/pci.h
+++ b/usr.sbin/vmd/pci.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: pci.h,v 1.10 2023/02/06 20:33:34 dv Exp $ */
+/* $OpenBSD: pci.h,v 1.11 2024/07/10 09:27:33 dv Exp $ */
/*
* Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
@@ -93,9 +93,6 @@ struct pci {
};
int pci_find_first_device(uint16_t);
-void pci_handle_address_reg(struct vm_run_params *);
-void pci_handle_data_reg(struct vm_run_params *);
-uint8_t pci_handle_io(struct vm_run_params *);
void pci_init(void);
int pci_add_device(uint8_t *, uint16_t, uint16_t, uint8_t, uint8_t, uint16_t,
uint16_t, uint8_t, pci_cs_fn_t);
@@ -105,4 +102,10 @@ uint8_t pci_get_dev_irq(uint8_t);
int pci_dump(int);
int pci_restore(int);
+#ifdef __amd64__
+void pci_handle_address_reg(struct vm_run_params *);
+void pci_handle_data_reg(struct vm_run_params *);
+uint8_t pci_handle_io(struct vm_run_params *);
+#endif /* __amd64__ */
+
#endif /* _PCI_H_ */
diff --git a/usr.sbin/vmd/vioblk.c b/usr.sbin/vmd/vioblk.c
index 6e3e3147536..cef10e32cf4 100644
--- a/usr.sbin/vmd/vioblk.c
+++ b/usr.sbin/vmd/vioblk.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: vioblk.c,v 1.13 2024/02/20 21:40:37 dv Exp $ */
+/* $OpenBSD: vioblk.c,v 1.14 2024/07/10 09:27:33 dv Exp $ */
/*
* Copyright (c) 2023 Dave Voutila <dv@openbsd.org>
@@ -555,7 +555,7 @@ handle_sync_io(int fd, short event, void *arg)
case VIODEV_MSG_IO_WRITE:
/* Write IO: no reply needed */
if (handle_io_write(&msg, dev) == 1)
- virtio_assert_pic_irq(dev, 0);
+ virtio_assert_irq(dev, 0);
break;
case VIODEV_MSG_SHUTDOWN:
event_del(&dev->sync_iev.ev);
@@ -614,7 +614,7 @@ handle_io_write(struct viodev_msg *msg, struct virtio_dev *dev)
vioblk->cfg.isr_status = 0;
vioblk->vq[0].last_avail = 0;
vioblk->vq[0].notified_avail = 0;
- virtio_deassert_pic_irq(dev, msg->vcpu);
+ virtio_deassert_irq(dev, msg->vcpu);
}
break;
default:
diff --git a/usr.sbin/vmd/virtio.c b/usr.sbin/vmd/virtio.c
index 80d035ef60b..f203f822adc 100644
--- a/usr.sbin/vmd/virtio.c
+++ b/usr.sbin/vmd/virtio.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: virtio.c,v 1.114 2024/07/09 09:31:37 dv Exp $ */
+/* $OpenBSD: virtio.c,v 1.115 2024/07/10 09:27:33 dv Exp $ */
/*
* Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
@@ -47,7 +47,6 @@
#include "vioscsi.h"
#include "virtio.h"
#include "vmd.h"
-#include "vmm.h"
extern struct vmd *env;
extern char *__progname;
@@ -274,7 +273,7 @@ virtio_rnd_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
case VIRTIO_CONFIG_ISR_STATUS:
*data = viornd.cfg.isr_status;
viornd.cfg.isr_status = 0;
- vcpu_deassert_pic_irq(viornd.vm_id, 0, viornd.irq);
+ vcpu_deassert_irq(viornd.vm_id, 0, viornd.irq);
break;
}
}
@@ -310,7 +309,7 @@ vmmci_ctl(unsigned int cmd)
/* Trigger interrupt */
vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
- vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
+ vcpu_assert_irq(vmmci.vm_id, 0, vmmci.irq);
/* Add ACK timeout */
tv.tv_sec = VMMCI_TIMEOUT;
@@ -322,7 +321,7 @@ vmmci_ctl(unsigned int cmd)
vmmci.cmd = cmd;
vmmci.cfg.isr_status = VIRTIO_CONFIG_ISR_CONFIG_CHANGE;
- vcpu_assert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
+ vcpu_assert_irq(vmmci.vm_id, 0, vmmci.irq);
} else {
log_debug("%s: RTC sync skipped (guest does not "
"support RTC sync)\n", __func__);
@@ -468,7 +467,7 @@ vmmci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
case VIRTIO_CONFIG_ISR_STATUS:
*data = vmmci.cfg.isr_status;
vmmci.cfg.isr_status = 0;
- vcpu_deassert_pic_irq(vmmci.vm_id, 0, vmmci.irq);
+ vcpu_deassert_irq(vmmci.vm_id, 0, vmmci.irq);
break;
}
}
@@ -1586,9 +1585,9 @@ handle_dev_msg(struct viodev_msg *msg, struct virtio_dev *gdev)
switch (msg->type) {
case VIODEV_MSG_KICK:
if (msg->state == INTR_STATE_ASSERT)
- vcpu_assert_pic_irq(vm_id, msg->vcpu, irq);
+ vcpu_assert_irq(vm_id, msg->vcpu, irq);
else if (msg->state == INTR_STATE_DEASSERT)
- vcpu_deassert_pic_irq(vm_id, msg->vcpu, irq);
+ vcpu_deassert_irq(vm_id, msg->vcpu, irq);
break;
case VIODEV_MSG_READY:
log_debug("%s: device reports ready", __func__);
@@ -1702,9 +1701,9 @@ virtio_pci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
* device performs a register read.
*/
if (msg.state == INTR_STATE_ASSERT)
- vcpu_assert_pic_irq(dev->vm_id, msg.vcpu, msg.irq);
+ vcpu_assert_irq(dev->vm_id, msg.vcpu, msg.irq);
else if (msg.state == INTR_STATE_DEASSERT)
- vcpu_deassert_pic_irq(dev->vm_id, msg.vcpu, msg.irq);
+ vcpu_deassert_irq(dev->vm_id, msg.vcpu, msg.irq);
} else {
log_warnx("%s: expected IO_READ, got %d", __func__,
msg.type);
@@ -1716,7 +1715,7 @@ virtio_pci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr,
}
void
-virtio_assert_pic_irq(struct virtio_dev *dev, int vcpu)
+virtio_assert_irq(struct virtio_dev *dev, int vcpu)
{
struct viodev_msg msg;
int ret;
@@ -1734,7 +1733,7 @@ virtio_assert_pic_irq(struct virtio_dev *dev, int vcpu)
}
void
-virtio_deassert_pic_irq(struct virtio_dev *dev, int vcpu)
+virtio_deassert_irq(struct virtio_dev *dev, int vcpu)
{
struct viodev_msg msg;
int ret;
diff --git a/usr.sbin/vmd/virtio.h b/usr.sbin/vmd/virtio.h
index 58f2c216837..c293743050c 100644
--- a/usr.sbin/vmd/virtio.h
+++ b/usr.sbin/vmd/virtio.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: virtio.h,v 1.51 2024/02/20 21:40:37 dv Exp $ */
+/* $OpenBSD: virtio.h,v 1.52 2024/07/10 09:27:33 dv Exp $ */
/*
* Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
@@ -346,8 +346,8 @@ uint32_t vring_size(uint32_t);
int vm_device_pipe(struct virtio_dev *, void (*)(int, short, void *),
struct event_base *);
int virtio_pci_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
-void virtio_assert_pic_irq(struct virtio_dev *, int);
-void virtio_deassert_pic_irq(struct virtio_dev *, int);
+void virtio_assert_irq(struct virtio_dev *, int);
+void virtio_deassert_irq(struct virtio_dev *, int);
int virtio_rnd_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
int viornd_dump(int);
diff --git a/usr.sbin/vmd/vm.c b/usr.sbin/vmd/vm.c
index 078e9b5172f..e8c73b0e053 100644
--- a/usr.sbin/vmd/vm.c
+++ b/usr.sbin/vmd/vm.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: vm.c,v 1.103 2024/07/09 09:31:37 dv Exp $ */
+/* $OpenBSD: vm.c,v 1.104 2024/07/10 09:27:33 dv Exp $ */
/*
* Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
@@ -22,21 +22,14 @@
#include <sys/queue.h>
#include <sys/wait.h>
#include <sys/uio.h>
-#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/resource.h>
-#include <dev/ic/i8253reg.h>
-#include <dev/isa/isareg.h>
#include <dev/pci/pcireg.h>
#include <dev/vmm/vmm.h>
-#include <machine/psl.h>
-#include <machine/pte.h>
-#include <machine/specialreg.h>
-
#include <net/if.h>
#include <errno.h>
@@ -55,57 +48,28 @@
#include <util.h>
#include "atomicio.h"
-#include "fw_cfg.h"
-#include "i8253.h"
-#include "i8259.h"
-#include "loadfile.h"
-#include "mc146818.h"
#include "mmio.h"
-#include "ns8250.h"
#include "pci.h"
#include "virtio.h"
#include "vmd.h"
-#include "vmm.h"
-
-#define MB(x) (x * 1024UL * 1024UL)
-#define GB(x) (x * 1024UL * 1024UL * 1024UL)
#define MMIO_NOTYET 0
-io_fn_t ioports_map[MAX_PORTS];
-
static int run_vm(struct vmop_create_params *, struct vcpu_reg_state *);
-void vm_dispatch_vmm(int, short, void *);
-void *event_thread(void *);
-void *vcpu_run_loop(void *);
-int vcpu_exit(struct vm_run_params *);
-int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
-void create_memory_map(struct vm_create_params *);
+static void vm_dispatch_vmm(int, short, void *);
+static void *event_thread(void *);
+static void *vcpu_run_loop(void *);
static int vmm_create_vm(struct vmd_vm *);
-int alloc_guest_mem(struct vmd_vm *);
-void init_emulated_hw(struct vmop_create_params *, int,
- int[][VM_MAX_BASE_PER_DISK], int *);
-void restore_emulated_hw(struct vm_create_params *, int, int *,
- int[][VM_MAX_BASE_PER_DISK],int);
-void vcpu_exit_inout(struct vm_run_params *);
-int vcpu_exit_eptviolation(struct vm_run_params *);
-uint8_t vcpu_exit_pci(struct vm_run_params *);
-int vcpu_pic_intr(uint32_t, uint32_t, uint8_t);
-int loadfile_bios(gzFile, off_t, struct vcpu_reg_state *);
+static int alloc_guest_mem(struct vmd_vm *);
static int send_vm(int, struct vmd_vm *);
-int dump_send_header(int);
static int dump_vmr(int , struct vm_mem_range *);
static int dump_mem(int, struct vmd_vm *);
-void restore_vmr(int, struct vm_mem_range *);
-void restore_mem(int, struct vm_create_params *);
-int restore_vm_params(int, struct vm_create_params *);
+static void restore_vmr(int, struct vm_mem_range *);
+static void restore_mem(int, struct vm_create_params *);
+static int restore_vm_params(int, struct vm_create_params *);
static void pause_vm(struct vmd_vm *);
static void unpause_vm(struct vmd_vm *);
-
-int translate_gva(struct vm_exit*, uint64_t, uint64_t *, int);
-
-static struct vm_mem_range *find_gpa_range(struct vm_create_params *, paddr_t,
- size_t);
+static int start_vm(struct vmd_vm *, int);
int con_fd;
struct vmd_vm *current_vm;
@@ -128,93 +92,6 @@ uint8_t vcpu_hlt[VMM_MAX_VCPUS_PER_VM];
uint8_t vcpu_done[VMM_MAX_VCPUS_PER_VM];
/*
- * Represents a standard register set for an OS to be booted
- * as a flat 64 bit address space.
- *
- * NOT set here are:
- * RIP
- * RSP
- * GDTR BASE
- *
- * Specific bootloaders should clone this structure and override
- * those fields as needed.
- *
- * Note - CR3 and various bits in CR0 may be overridden by vmm(4) based on
- * features of the CPU in use.
- */
-static const struct vcpu_reg_state vcpu_init_flat64 = {
- .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
- .vrs_gprs[VCPU_REGS_RIP] = 0x0,
- .vrs_gprs[VCPU_REGS_RSP] = 0x0,
- .vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE | CR0_PG,
- .vrs_crs[VCPU_REGS_CR3] = PML4_PAGE,
- .vrs_crs[VCPU_REGS_CR4] = CR4_PAE | CR4_PSE,
- .vrs_crs[VCPU_REGS_PDPTE0] = 0ULL,
- .vrs_crs[VCPU_REGS_PDPTE1] = 0ULL,
- .vrs_crs[VCPU_REGS_PDPTE2] = 0ULL,
- .vrs_crs[VCPU_REGS_PDPTE3] = 0ULL,
- .vrs_sregs[VCPU_REGS_CS] = { 0x8, 0xFFFFFFFF, 0xC09F, 0x0},
- .vrs_sregs[VCPU_REGS_DS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
- .vrs_sregs[VCPU_REGS_ES] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
- .vrs_sregs[VCPU_REGS_FS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
- .vrs_sregs[VCPU_REGS_GS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
- .vrs_sregs[VCPU_REGS_SS] = { 0x10, 0xFFFFFFFF, 0xC093, 0x0},
- .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
- .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
- .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
- .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
- .vrs_msrs[VCPU_REGS_EFER] = EFER_LME | EFER_LMA,
- .vrs_drs[VCPU_REGS_DR0] = 0x0,
- .vrs_drs[VCPU_REGS_DR1] = 0x0,
- .vrs_drs[VCPU_REGS_DR2] = 0x0,
- .vrs_drs[VCPU_REGS_DR3] = 0x0,
- .vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
- .vrs_drs[VCPU_REGS_DR7] = 0x400,
- .vrs_msrs[VCPU_REGS_STAR] = 0ULL,
- .vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
- .vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
- .vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
- .vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
- .vrs_msrs[VCPU_REGS_MISC_ENABLE] = 0ULL,
- .vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
-};
-
-/*
- * Represents a standard register set for an BIOS to be booted
- * as a flat 16 bit address space.
- */
-static const struct vcpu_reg_state vcpu_init_flat16 = {
- .vrs_gprs[VCPU_REGS_RFLAGS] = 0x2,
- .vrs_gprs[VCPU_REGS_RIP] = 0xFFF0,
- .vrs_gprs[VCPU_REGS_RSP] = 0x0,
- .vrs_crs[VCPU_REGS_CR0] = 0x60000010,
- .vrs_crs[VCPU_REGS_CR3] = 0,
- .vrs_sregs[VCPU_REGS_CS] = { 0xF000, 0xFFFF, 0x809F, 0xF0000},
- .vrs_sregs[VCPU_REGS_DS] = { 0x0, 0xFFFF, 0x8093, 0x0},
- .vrs_sregs[VCPU_REGS_ES] = { 0x0, 0xFFFF, 0x8093, 0x0},
- .vrs_sregs[VCPU_REGS_FS] = { 0x0, 0xFFFF, 0x8093, 0x0},
- .vrs_sregs[VCPU_REGS_GS] = { 0x0, 0xFFFF, 0x8093, 0x0},
- .vrs_sregs[VCPU_REGS_SS] = { 0x0, 0xFFFF, 0x8093, 0x0},
- .vrs_gdtr = { 0x0, 0xFFFF, 0x0, 0x0},
- .vrs_idtr = { 0x0, 0xFFFF, 0x0, 0x0},
- .vrs_sregs[VCPU_REGS_LDTR] = { 0x0, 0xFFFF, 0x0082, 0x0},
- .vrs_sregs[VCPU_REGS_TR] = { 0x0, 0xFFFF, 0x008B, 0x0},
- .vrs_msrs[VCPU_REGS_EFER] = 0ULL,
- .vrs_drs[VCPU_REGS_DR0] = 0x0,
- .vrs_drs[VCPU_REGS_DR1] = 0x0,
- .vrs_drs[VCPU_REGS_DR2] = 0x0,
- .vrs_drs[VCPU_REGS_DR3] = 0x0,
- .vrs_drs[VCPU_REGS_DR6] = 0xFFFF0FF0,
- .vrs_drs[VCPU_REGS_DR7] = 0x400,
- .vrs_msrs[VCPU_REGS_STAR] = 0ULL,
- .vrs_msrs[VCPU_REGS_LSTAR] = 0ULL,
- .vrs_msrs[VCPU_REGS_CSTAR] = 0ULL,
- .vrs_msrs[VCPU_REGS_SFMASK] = 0ULL,
- .vrs_msrs[VCPU_REGS_KGSBASE] = 0ULL,
- .vrs_crs[VCPU_REGS_XCR0] = XFEATURE_X87
-};
-
-/*
* vm_main
*
* Primary entrypoint for launching a vm. Does not return.
@@ -291,58 +168,6 @@ vm_main(int fd, int fd_vmm)
}
/*
- * loadfile_bios
- *
- * Alternatively to loadfile_elf, this function loads a non-ELF BIOS image
- * directly into memory.
- *
- * Parameters:
- * fp: file of a kernel file to load
- * size: uncompressed size of the image
- * (out) vrs: register state to set on init for this kernel
- *
- * Return values:
- * 0 if successful
- * various error codes returned from read(2) or loadelf functions
- */
-int
-loadfile_bios(gzFile fp, off_t size, struct vcpu_reg_state *vrs)
-{
- off_t off;
-
- /* Set up a "flat 16 bit" register state for BIOS */
- memcpy(vrs, &vcpu_init_flat16, sizeof(*vrs));
-
- /* Seek to the beginning of the BIOS image */
- if (gzseek(fp, 0, SEEK_SET) == -1)
- return (-1);
-
- /* The BIOS image must end at 1MB */
- if ((off = MB(1) - size) < 0)
- return (-1);
-
- /* Read BIOS image into memory */
- if (mread(fp, off, size) != (size_t)size) {
- errno = EIO;
- return (-1);
- }
-
- if (gzseek(fp, 0, SEEK_SET) == -1)
- return (-1);
-
- /* Read a second BIOS copy into memory ending at 4GB */
- off = GB(4) - size;
- if (mread(fp, off, size) != (size_t)size) {
- errno = EIO;
- return (-1);
- }
-
- log_debug("%s: loaded BIOS image", __func__);
-
- return (0);
-}
-
-/*
* start_vm
*
* After forking a new VM process, starts the new VM with the creation
@@ -372,10 +197,8 @@ start_vm(struct vmd_vm *vm, int fd)
struct vcpu_reg_state vrs;
int nicfds[VM_MAX_NICS_PER_VM];
int ret;
- gzFile fp;
size_t i;
struct vm_rwregs_params vrp;
- struct stat sb;
/*
* We first try to initialize and allocate memory before bothering
@@ -433,33 +256,8 @@ start_vm(struct vmd_vm *vm, int fd)
if (ret != sizeof(vrp))
fatal("received incomplete vrp - exiting");
vrs = vrp.vrwp_regs;
- } else {
- /*
- * Set up default "flat 64 bit" register state - RIP,
- * RSP, and GDT info will be set in bootloader
- */
- memcpy(&vrs, &vcpu_init_flat64, sizeof(vrs));
-
- /* Find and open kernel image */
- if ((fp = gzdopen(vm->vm_kernel, "r")) == NULL)
- fatalx("failed to open kernel - exiting");
-
- /* Load kernel image */
- ret = loadfile_elf(fp, vm, &vrs, vmc->vmc_bootdevice);
-
- /*
- * Try BIOS as a fallback (only if it was provided as an image
- * with vm->vm_kernel and the file is not compressed)
- */
- if (ret && errno == ENOEXEC && vm->vm_kernel != -1 &&
- gzdirect(fp) && (ret = fstat(vm->vm_kernel, &sb)) == 0)
- ret = loadfile_bios(fp, sb.st_size, &vrs);
-
- if (ret)
- fatal("failed to load kernel or BIOS - exiting");
-
- gzclose(fp);
- }
+ } else if (load_firmware(vm, &vrs))
+ fatalx("failed to load kernel or firmware image");
if (vm->vm_kernel != -1)
close_fd(vm->vm_kernel);
@@ -721,15 +519,7 @@ send_vm(int fd, struct vmd_vm *vm)
/* Dump memory before devices to aid in restoration. */
if ((ret = dump_mem(fd, vm)))
goto err;
- if ((ret = i8253_dump(fd)))
- goto err;
- if ((ret = i8259_dump(fd)))
- goto err;
- if ((ret = ns8250_dump(fd)))
- goto err;
- if ((ret = mc146818_dump(fd)))
- goto err;
- if ((ret = fw_cfg_dump(fd)))
+ if ((ret = dump_devs(fd)))
goto err;
if ((ret = pci_dump(fd)))
goto err;
@@ -765,46 +555,6 @@ err:
}
int
-dump_send_header(int fd) {
- struct vm_dump_header vmh;
- int i;
-
- memcpy(&vmh.vmh_signature, VM_DUMP_SIGNATURE,
- sizeof(vmh.vmh_signature));
-
- vmh.vmh_cpuids[0].code = 0x00;
- vmh.vmh_cpuids[0].leaf = 0x00;
-
- vmh.vmh_cpuids[1].code = 0x01;
- vmh.vmh_cpuids[1].leaf = 0x00;
-
- vmh.vmh_cpuids[2].code = 0x07;
- vmh.vmh_cpuids[2].leaf = 0x00;
-
- vmh.vmh_cpuids[3].code = 0x0d;
- vmh.vmh_cpuids[3].leaf = 0x00;
-
- vmh.vmh_cpuids[4].code = 0x80000001;
- vmh.vmh_cpuids[4].leaf = 0x00;
-
- vmh.vmh_version = VM_DUMP_VERSION;
-
- for (i=0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
- CPUID_LEAF(vmh.vmh_cpuids[i].code,
- vmh.vmh_cpuids[i].leaf,
- vmh.vmh_cpuids[i].a,
- vmh.vmh_cpuids[i].b,
- vmh.vmh_cpuids[i].c,
- vmh.vmh_cpuids[i].d);
- }
-
- if (atomicio(vwrite, fd, &vmh, sizeof(vmh)) != sizeof(vmh))
- return (-1);
-
- return (0);
-}
-
-int
dump_mem(int fd, struct vmd_vm *vm)
{
unsigned int i;
@@ -933,10 +683,7 @@ pause_vm(struct vmd_vm *vm)
return;
}
- i8253_stop();
- mc146818_stop();
- ns8250_stop();
- virtio_stop(vm);
+ pause_vm_md(vm);
}
static void
@@ -962,10 +709,7 @@ unpause_vm(struct vmd_vm *vm)
}
}
- i8253_start();
- mc146818_start();
- ns8250_start();
- virtio_start(vm);
+ unpause_vm_md(vm);
}
/*
@@ -1003,99 +747,6 @@ vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcpu_reg_state *vrs)
}
/*
- * create_memory_map
- *
- * Sets up the guest physical memory ranges that the VM can access.
- *
- * Parameters:
- * vcp: VM create parameters describing the VM whose memory map
- * is being created
- *
- * Return values:
- * nothing
- */
-void
-create_memory_map(struct vm_create_params *vcp)
-{
- size_t len, mem_bytes;
- size_t above_1m = 0, above_4g = 0;
-
- mem_bytes = vcp->vcp_memranges[0].vmr_size;
- vcp->vcp_nmemranges = 0;
- if (mem_bytes == 0 || mem_bytes > VMM_MAX_VM_MEM_SIZE)
- return;
-
- /* First memory region: 0 - LOWMEM_KB (DOS low mem) */
- len = LOWMEM_KB * 1024;
- vcp->vcp_memranges[0].vmr_gpa = 0x0;
- vcp->vcp_memranges[0].vmr_size = len;
- vcp->vcp_memranges[0].vmr_type = VM_MEM_RAM;
- mem_bytes -= len;
-
- /*
- * Second memory region: LOWMEM_KB - 1MB.
- *
- * N.B. - Normally ROMs or parts of video RAM are mapped here.
- * We have to add this region, because some systems
- * unconditionally write to 0xb8000 (VGA RAM), and
- * we need to make sure that vmm(4) permits accesses
- * to it. So allocate guest memory for it.
- */
- len = MB(1) - (LOWMEM_KB * 1024);
- vcp->vcp_memranges[1].vmr_gpa = LOWMEM_KB * 1024;
- vcp->vcp_memranges[1].vmr_size = len;
- vcp->vcp_memranges[1].vmr_type = VM_MEM_RESERVED;
- mem_bytes -= len;
-
- /* If we have less than 2MB remaining, still create a 2nd BIOS area. */
- if (mem_bytes <= MB(2)) {
- vcp->vcp_memranges[2].vmr_gpa = VMM_PCI_MMIO_BAR_END;
- vcp->vcp_memranges[2].vmr_size = MB(2);
- vcp->vcp_memranges[2].vmr_type = VM_MEM_RESERVED;
- vcp->vcp_nmemranges = 3;
- return;
- }
-
- /*
- * Calculate the how to split any remaining memory across the 4GB
- * boundary while making sure we do not place physical memory into
- * MMIO ranges.
- */
- if (mem_bytes > VMM_PCI_MMIO_BAR_BASE - MB(1)) {
- above_1m = VMM_PCI_MMIO_BAR_BASE - MB(1);
- above_4g = mem_bytes - above_1m;
- } else {
- above_1m = mem_bytes;
- above_4g = 0;
- }
-
- /* Third memory region: area above 1MB to MMIO region */
- vcp->vcp_memranges[2].vmr_gpa = MB(1);
- vcp->vcp_memranges[2].vmr_size = above_1m;
- vcp->vcp_memranges[2].vmr_type = VM_MEM_RAM;
-
- /* Fourth region: PCI MMIO range */
- vcp->vcp_memranges[3].vmr_gpa = VMM_PCI_MMIO_BAR_BASE;
- vcp->vcp_memranges[3].vmr_size = VMM_PCI_MMIO_BAR_END -
- VMM_PCI_MMIO_BAR_BASE + 1;
- vcp->vcp_memranges[3].vmr_type = VM_MEM_MMIO;
-
- /* Fifth region: 2nd copy of BIOS above MMIO ending at 4GB */
- vcp->vcp_memranges[4].vmr_gpa = VMM_PCI_MMIO_BAR_END + 1;
- vcp->vcp_memranges[4].vmr_size = MB(2);
- vcp->vcp_memranges[4].vmr_type = VM_MEM_RESERVED;
-
- /* Sixth region: any remainder above 4GB */
- if (above_4g > 0) {
- vcp->vcp_memranges[5].vmr_gpa = GB(4);
- vcp->vcp_memranges[5].vmr_size = above_4g;
- vcp->vcp_memranges[5].vmr_type = VM_MEM_RAM;
- vcp->vcp_nmemranges = 6;
- } else
- vcp->vcp_nmemranges = 5;
-}
-
-/*
* alloc_guest_mem
*
* Allocates memory for the guest.
@@ -1190,142 +841,8 @@ vmm_create_vm(struct vmd_vm *vm)
return (0);
}
-/*
- * init_emulated_hw
- *
- * Initializes the userspace hardware emulation
- */
-void
-init_emulated_hw(struct vmop_create_params *vmc, int child_cdrom,
- int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps)
-{
- struct vm_create_params *vcp = &vmc->vmc_params;
- size_t i;
- uint64_t memlo, memhi;
-
- /* Calculate memory size for NVRAM registers */
- memlo = memhi = 0;
- for (i = 0; i < vcp->vcp_nmemranges; i++) {
- if (vcp->vcp_memranges[i].vmr_gpa == MB(1) &&
- vcp->vcp_memranges[i].vmr_size > (15 * MB(1)))
- memlo = vcp->vcp_memranges[i].vmr_size - (15 * MB(1));
- else if (vcp->vcp_memranges[i].vmr_gpa == GB(4))
- memhi = vcp->vcp_memranges[i].vmr_size;
- }
-
- /* Reset the IO port map */
- memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
-
- /* Init i8253 PIT */
- i8253_init(vcp->vcp_id);
- ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
- ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
- ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
- ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
- ioports_map[PCKBC_AUX] = vcpu_exit_i8253_misc;
-
- /* Init mc146818 RTC */
- mc146818_init(vcp->vcp_id, memlo, memhi);
- ioports_map[IO_RTC] = vcpu_exit_mc146818;
- ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
-
- /* Init master and slave PICs */
- i8259_init();
- ioports_map[IO_ICU1] = vcpu_exit_i8259;
- ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
- ioports_map[IO_ICU2] = vcpu_exit_i8259;
- ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
- ioports_map[ELCR0] = vcpu_exit_elcr;
- ioports_map[ELCR1] = vcpu_exit_elcr;
-
- /* Init ns8250 UART */
- ns8250_init(con_fd, vcp->vcp_id);
- for (i = COM1_DATA; i <= COM1_SCR; i++)
- ioports_map[i] = vcpu_exit_com;
-
- /* Initialize PCI */
- for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++)
- ioports_map[i] = vcpu_exit_pci;
-
- ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
- ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
- ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
- ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
- ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
- pci_init();
-
- /* Initialize virtio devices */
- virtio_init(current_vm, child_cdrom, child_disks, child_taps);
/*
- * Init QEMU fw_cfg interface. Must be done last for pci hardware
- * detection.
- */
- fw_cfg_init(vmc);
- ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
- ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
- ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
- ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
-}
-
-/*
- * restore_emulated_hw
- *
- * Restores the userspace hardware emulation from fd
- */
-void
-restore_emulated_hw(struct vm_create_params *vcp, int fd,
- int *child_taps, int child_disks[][VM_MAX_BASE_PER_DISK], int child_cdrom)
-{
- /* struct vm_create_params *vcp = &vmc->vmc_params; */
- int i;
- memset(&ioports_map, 0, sizeof(io_fn_t) * MAX_PORTS);
-
- /* Init i8253 PIT */
- i8253_restore(fd, vcp->vcp_id);
- ioports_map[TIMER_CTRL] = vcpu_exit_i8253;
- ioports_map[TIMER_BASE + TIMER_CNTR0] = vcpu_exit_i8253;
- ioports_map[TIMER_BASE + TIMER_CNTR1] = vcpu_exit_i8253;
- ioports_map[TIMER_BASE + TIMER_CNTR2] = vcpu_exit_i8253;
-
- /* Init master and slave PICs */
- i8259_restore(fd);
- ioports_map[IO_ICU1] = vcpu_exit_i8259;
- ioports_map[IO_ICU1 + 1] = vcpu_exit_i8259;
- ioports_map[IO_ICU2] = vcpu_exit_i8259;
- ioports_map[IO_ICU2 + 1] = vcpu_exit_i8259;
-
- /* Init ns8250 UART */
- ns8250_restore(fd, con_fd, vcp->vcp_id);
- for (i = COM1_DATA; i <= COM1_SCR; i++)
- ioports_map[i] = vcpu_exit_com;
-
- /* Init mc146818 RTC */
- mc146818_restore(fd, vcp->vcp_id);
- ioports_map[IO_RTC] = vcpu_exit_mc146818;
- ioports_map[IO_RTC + 1] = vcpu_exit_mc146818;
-
- /* Init QEMU fw_cfg interface */
- fw_cfg_restore(fd);
- ioports_map[FW_CFG_IO_SELECT] = vcpu_exit_fw_cfg;
- ioports_map[FW_CFG_IO_DATA] = vcpu_exit_fw_cfg;
- ioports_map[FW_CFG_IO_DMA_ADDR_HIGH] = vcpu_exit_fw_cfg_dma;
- ioports_map[FW_CFG_IO_DMA_ADDR_LOW] = vcpu_exit_fw_cfg_dma;
-
- /* Initialize PCI */
- for (i = VM_PCI_IO_BAR_BASE; i <= VM_PCI_IO_BAR_END; i++)
- ioports_map[i] = vcpu_exit_pci;
-
- ioports_map[PCI_MODE1_ADDRESS_REG] = vcpu_exit_pci;
- ioports_map[PCI_MODE1_DATA_REG] = vcpu_exit_pci;
- ioports_map[PCI_MODE1_DATA_REG + 1] = vcpu_exit_pci;
- ioports_map[PCI_MODE1_DATA_REG + 2] = vcpu_exit_pci;
- ioports_map[PCI_MODE1_DATA_REG + 3] = vcpu_exit_pci;
- pci_restore(fd);
- virtio_restore(fd, current_vm, child_cdrom, child_disks, child_taps);
-}
-
-/*
* run_vm
*
* Runs the VM whose creation parameters are specified in vcp
@@ -1525,7 +1042,7 @@ run_vm(struct vmop_create_params *vmc, struct vcpu_reg_state *vrs)
return (ret);
}
-void *
+static void *
event_thread(void *arg)
{
uint8_t *donep = arg;
@@ -1555,7 +1072,7 @@ event_thread(void *arg)
* NULL: the VCPU shutdown properly
* !NULL: error processing VCPU run, or the VCPU shutdown abnormally
*/
-void *
+static void *
vcpu_run_loop(void *arg)
{
struct vm_run_params *vrp = (struct vm_run_params *)arg;
@@ -1593,7 +1110,7 @@ vcpu_run_loop(void *arg)
return ((void *)ret);
}
- /* i8259 may be firing as we pause, release run mtx. */
+ /* Interrupt may be firing, release run mtx. */
mutex_unlock(&vcpu_run_mtx[n]);
ret = pthread_cond_wait(&vcpu_unpause_cond[n],
&vcpu_unpause_mtx[n]);
@@ -1636,14 +1153,14 @@ vcpu_run_loop(void *arg)
break;
}
- if (vrp->vrp_irqready && i8259_is_pending()) {
- vrp->vrp_inject.vie_vector = i8259_ack();
+ if (vrp->vrp_irqready && intr_pending(current_vm)) {
+ vrp->vrp_inject.vie_vector = intr_ack(current_vm);
vrp->vrp_inject.vie_type = VCPU_INJECT_INTR;
} else
vrp->vrp_inject.vie_type = VCPU_INJECT_NONE;
/* Still more interrupts pending? */
- vrp->vrp_intr_pending = i8259_is_pending();
+ vrp->vrp_intr_pending = intr_pending(current_vm);
if (ioctl(env->vmd_fd, VMM_IOC_RUN, vrp) == -1) {
/* If run ioctl failed, exit */
@@ -1682,7 +1199,7 @@ vcpu_run_loop(void *arg)
}
int
-vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
+vcpu_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
{
struct vm_intr_params vip;
@@ -1699,503 +1216,6 @@ vcpu_pic_intr(uint32_t vm_id, uint32_t vcpu_id, uint8_t intr)
}
/*
- * vcpu_exit_pci
- *
- * Handle all I/O to the emulated PCI subsystem.
- *
- * Parameters:
- * vrp: vcpu run parameters containing guest state for this exit
- *
- * Return value:
- * Interrupt to inject to the guest VM, or 0xFF if no interrupt should
- * be injected.
- */
-uint8_t
-vcpu_exit_pci(struct vm_run_params *vrp)
-{
- struct vm_exit *vei = vrp->vrp_exit;
- uint8_t intr;
-
- intr = 0xFF;
-
- switch (vei->vei.vei_port) {
- case PCI_MODE1_ADDRESS_REG:
- pci_handle_address_reg(vrp);
- break;
- case PCI_MODE1_DATA_REG:
- case PCI_MODE1_DATA_REG + 1:
- case PCI_MODE1_DATA_REG + 2:
- case PCI_MODE1_DATA_REG + 3:
- pci_handle_data_reg(vrp);
- break;
- case VM_PCI_IO_BAR_BASE ... VM_PCI_IO_BAR_END:
- intr = pci_handle_io(vrp);
- break;
- default:
- log_warnx("%s: unknown PCI register 0x%llx",
- __progname, (uint64_t)vei->vei.vei_port);
- break;
- }
-
- return (intr);
-}
-
-/*
- * vcpu_exit_inout
- *
- * Handle all I/O exits that need to be emulated in vmd. This includes the
- * i8253 PIT, the com1 ns8250 UART, and the MC146818 RTC/NVRAM device.
- *
- * Parameters:
- * vrp: vcpu run parameters containing guest state for this exit
- */
-void
-vcpu_exit_inout(struct vm_run_params *vrp)
-{
- struct vm_exit *vei = vrp->vrp_exit;
- uint8_t intr = 0xFF;
-
- if (vei->vei.vei_rep || vei->vei.vei_string) {
-#ifdef MMIO_DEBUG
- log_info("%s: %s%s%s %d-byte, enc=%d, data=0x%08x, port=0x%04x",
- __func__,
- vei->vei.vei_rep == 0 ? "" : "REP ",
- vei->vei.vei_dir == VEI_DIR_IN ? "IN" : "OUT",
- vei->vei.vei_string == 0 ? "" : "S",
- vei->vei.vei_size, vei->vei.vei_encoding,
- vei->vei.vei_data, vei->vei.vei_port);
- log_info("%s: ECX = 0x%llx, RDX = 0x%llx, RSI = 0x%llx",
- __func__,
- vei->vrs.vrs_gprs[VCPU_REGS_RCX],
- vei->vrs.vrs_gprs[VCPU_REGS_RDX],
- vei->vrs.vrs_gprs[VCPU_REGS_RSI]);
-#endif /* MMIO_DEBUG */
- fatalx("%s: can't emulate REP prefixed IN(S)/OUT(S)",
- __func__);
- }
-
- if (ioports_map[vei->vei.vei_port] != NULL)
- intr = ioports_map[vei->vei.vei_port](vrp);
- else if (vei->vei.vei_dir == VEI_DIR_IN)
- set_return_data(vei, 0xFFFFFFFF);
-
- vei->vrs.vrs_gprs[VCPU_REGS_RIP] += vei->vei.vei_insn_len;
-
- if (intr != 0xFF)
- vcpu_assert_pic_irq(vrp->vrp_vm_id, vrp->vrp_vcpu_id, intr);
-}
-
-/*
- * vcpu_exit_eptviolation
- *
- * handle an EPT Violation
- *
- * Parameters:
- * vrp: vcpu run parameters containing guest state for this exit
- *
- * Return values:
- * 0: no action required
- * EFAULT: a protection fault occured, kill the vm.
- */
-int
-vcpu_exit_eptviolation(struct vm_run_params *vrp)
-{
- struct vm_exit *ve = vrp->vrp_exit;
- int ret = 0;
-#if MMIO_NOTYET
- struct x86_insn insn;
- uint64_t va, pa;
- size_t len = 15; /* Max instruction length in x86. */
-#endif /* MMIO_NOTYET */
- switch (ve->vee.vee_fault_type) {
- case VEE_FAULT_HANDLED:
- break;
-
-#if MMIO_NOTYET
- case VEE_FAULT_MMIO_ASSIST:
- /* Intel VMX might give us the length of the instruction. */
- if (ve->vee.vee_insn_info & VEE_LEN_VALID)
- len = ve->vee.vee_insn_len;
-
- if (len > 15)
- fatalx("%s: invalid instruction length %lu", __func__,
- len);
-
- /* If we weren't given instruction bytes, we need to fetch. */
- if (!(ve->vee.vee_insn_info & VEE_BYTES_VALID)) {
- memset(ve->vee.vee_insn_bytes, 0,
- sizeof(ve->vee.vee_insn_bytes));
- va = ve->vrs.vrs_gprs[VCPU_REGS_RIP];
-
- /* XXX Only support instructions that fit on 1 page. */
- if ((va & PAGE_MASK) + len > PAGE_SIZE) {
- log_warnx("%s: instruction might cross page "
- "boundary", __func__);
- ret = EINVAL;
- break;
- }
-
- ret = translate_gva(ve, va, &pa, PROT_EXEC);
- if (ret != 0) {
- log_warnx("%s: failed gva translation",
- __func__);
- break;
- }
-
- ret = read_mem(pa, ve->vee.vee_insn_bytes, len);
- if (ret != 0) {
- log_warnx("%s: failed to fetch instruction "
- "bytes from 0x%llx", __func__, pa);
- break;
- }
- }
-
- ret = insn_decode(ve, &insn);
- if (ret == 0)
- ret = insn_emulate(ve, &insn);
- break;
-#endif /* MMIO_NOTYET */
-
- case VEE_FAULT_PROTECT:
- log_debug("%s: EPT Violation: rip=0x%llx", __progname,
- ve->vrs.vrs_gprs[VCPU_REGS_RIP]);
- ret = EFAULT;
- break;
-
- default:
- fatalx("%s: invalid fault_type %d", __progname,
- ve->vee.vee_fault_type);
- /* UNREACHED */
- }
-
- return (ret);
-}
-
-/*
- * vcpu_exit
- *
- * Handle a vcpu exit. This function is called when it is determined that
- * vmm(4) requires the assistance of vmd to support a particular guest
- * exit type (eg, accessing an I/O port or device). Guest state is contained
- * in 'vrp', and will be resent to vmm(4) on exit completion.
- *
- * Upon conclusion of handling the exit, the function determines if any
- * interrupts should be injected into the guest, and asserts the proper
- * IRQ line whose interrupt should be vectored.
- *
- * Parameters:
- * vrp: vcpu run parameters containing guest state for this exit
- *
- * Return values:
- * 0: the exit was handled successfully
- * 1: an error occurred (eg, unknown exit reason passed in 'vrp')
- */
-int
-vcpu_exit(struct vm_run_params *vrp)
-{
- int ret;
-
- switch (vrp->vrp_exit_reason) {
- case VMX_EXIT_INT_WINDOW:
- case SVM_VMEXIT_VINTR:
- case VMX_EXIT_CPUID:
- case VMX_EXIT_EXTINT:
- case SVM_VMEXIT_INTR:
- case SVM_VMEXIT_MSR:
- case SVM_VMEXIT_CPUID:
- /*
- * We may be exiting to vmd to handle a pending interrupt but
- * at the same time the last exit type may have been one of
- * these. In this case, there's nothing extra to be done
- * here (and falling through to the default case below results
- * in more vmd log spam).
- */
- break;
- case SVM_VMEXIT_NPF:
- case VMX_EXIT_EPT_VIOLATION:
- ret = vcpu_exit_eptviolation(vrp);
- if (ret)
- return (ret);
- break;
- case VMX_EXIT_IO:
- case SVM_VMEXIT_IOIO:
- vcpu_exit_inout(vrp);
- break;
- case VMX_EXIT_HLT:
- case SVM_VMEXIT_HLT:
- mutex_lock(&vm_mtx);
- vcpu_hlt[vrp->vrp_vcpu_id] = 1;
- mutex_unlock(&vm_mtx);
- break;
- case VMX_EXIT_TRIPLE_FAULT:
- case SVM_VMEXIT_SHUTDOWN:
- /* reset VM */
- return (EAGAIN);
- default:
- log_debug("%s: unknown exit reason 0x%x",
- __progname, vrp->vrp_exit_reason);
- }
-
- return (0);
-}
-
-/*
- * find_gpa_range
- *
- * Search for a contiguous guest physical mem range.
- *
- * Parameters:
- * vcp: VM create parameters that contain the memory map to search in
- * gpa: the starting guest physical address
- * len: the length of the memory range
- *
- * Return values:
- * NULL: on failure if there is no memory range as described by the parameters
- * Pointer to vm_mem_range that contains the start of the range otherwise.
- */
-static struct vm_mem_range *
-find_gpa_range(struct vm_create_params *vcp, paddr_t gpa, size_t len)
-{
- size_t i, n;
- struct vm_mem_range *vmr;
-
- /* Find the first vm_mem_range that contains gpa */
- for (i = 0; i < vcp->vcp_nmemranges; i++) {
- vmr = &vcp->vcp_memranges[i];
- if (gpa < vmr->vmr_gpa + vmr->vmr_size)
- break;
- }
-
- /* No range found. */
- if (i == vcp->vcp_nmemranges)
- return (NULL);
-
- /*
- * vmr may cover the range [gpa, gpa + len) only partly. Make
- * sure that the following vm_mem_ranges are contiguous and
- * cover the rest.
- */
- n = vmr->vmr_size - (gpa - vmr->vmr_gpa);
- if (len < n)
- len = 0;
- else
- len -= n;
- gpa = vmr->vmr_gpa + vmr->vmr_size;
- for (i = i + 1; len != 0 && i < vcp->vcp_nmemranges; i++) {
- vmr = &vcp->vcp_memranges[i];
- if (gpa != vmr->vmr_gpa)
- return (NULL);
- if (len <= vmr->vmr_size)
- len = 0;
- else
- len -= vmr->vmr_size;
-
- gpa = vmr->vmr_gpa + vmr->vmr_size;
- }
-
- if (len != 0)
- return (NULL);
-
- return (vmr);
-}
-
-/*
- * write_mem
- *
- * Copies data from 'buf' into the guest VM's memory at paddr 'dst'.
- *
- * Parameters:
- * dst: the destination paddr_t in the guest VM
- * buf: data to copy (or NULL to zero the data)
- * len: number of bytes to copy
- *
- * Return values:
- * 0: success
- * EINVAL: if the guest physical memory range [dst, dst + len) does not
- * exist in the guest.
- */
-int
-write_mem(paddr_t dst, const void *buf, size_t len)
-{
- const char *from = buf;
- char *to;
- size_t n, off;
- struct vm_mem_range *vmr;
-
- vmr = find_gpa_range(&current_vm->vm_params.vmc_params, dst, len);
- if (vmr == NULL) {
- errno = EINVAL;
- log_warn("%s: failed - invalid memory range dst = 0x%lx, "
- "len = 0x%zx", __func__, dst, len);
- return (EINVAL);
- }
-
- off = dst - vmr->vmr_gpa;
- while (len != 0) {
- n = vmr->vmr_size - off;
- if (len < n)
- n = len;
-
- to = (char *)vmr->vmr_va + off;
- if (buf == NULL)
- memset(to, 0, n);
- else {
- memcpy(to, from, n);
- from += n;
- }
- len -= n;
- off = 0;
- vmr++;
- }
-
- return (0);
-}
-
-/*
- * read_mem
- *
- * Reads memory at guest paddr 'src' into 'buf'.
- *
- * Parameters:
- * src: the source paddr_t in the guest VM to read from.
- * buf: destination (local) buffer
- * len: number of bytes to read
- *
- * Return values:
- * 0: success
- * EINVAL: if the guest physical memory range [dst, dst + len) does not
- * exist in the guest.
- */
-int
-read_mem(paddr_t src, void *buf, size_t len)
-{
- char *from, *to = buf;
- size_t n, off;
- struct vm_mem_range *vmr;
-
- vmr = find_gpa_range(&current_vm->vm_params.vmc_params, src, len);
- if (vmr == NULL) {
- errno = EINVAL;
- log_warn("%s: failed - invalid memory range src = 0x%lx, "
- "len = 0x%zx", __func__, src, len);
- return (EINVAL);
- }
-
- off = src - vmr->vmr_gpa;
- while (len != 0) {
- n = vmr->vmr_size - off;
- if (len < n)
- n = len;
-
- from = (char *)vmr->vmr_va + off;
- memcpy(to, from, n);
-
- to += n;
- len -= n;
- off = 0;
- vmr++;
- }
-
- return (0);
-}
-
-/*
- * hvaddr_mem
- *
- * Translate a guest physical address to a host virtual address, checking the
- * provided memory range length to confirm it's contiguous within the same
- * guest memory range (vm_mem_range).
- *
- * Parameters:
- * gpa: guest physical address to translate
- * len: number of bytes in the intended range
- *
- * Return values:
- * void* to host virtual memory on success
- * NULL on error, setting errno to:
- * EFAULT: gpa falls outside guest memory ranges
- * EINVAL: requested len extends beyond memory range
- */
-void *
-hvaddr_mem(paddr_t gpa, size_t len)
-{
- struct vm_mem_range *vmr;
- size_t off;
-
- vmr = find_gpa_range(&current_vm->vm_params.vmc_params, gpa, len);
- if (vmr == NULL) {
- log_warnx("%s: failed - invalid gpa: 0x%lx\n", __func__, gpa);
- errno = EFAULT;
- return (NULL);
- }
-
- off = gpa - vmr->vmr_gpa;
- if (len > (vmr->vmr_size - off)) {
- log_warnx("%s: failed - invalid memory range: gpa=0x%lx, "
- "len=%zu", __func__, gpa, len);
- errno = EINVAL;
- return (NULL);
- }
-
- return ((char *)vmr->vmr_va + off);
-}
-
-/*
- * vcpu_assert_pic_irq
- *
- * Injects the specified IRQ on the supplied vcpu/vm
- *
- * Parameters:
- * vm_id: VM ID to inject to
- * vcpu_id: VCPU ID to inject to
- * irq: IRQ to inject
- */
-void
-vcpu_assert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
-{
- int ret;
-
- i8259_assert_irq(irq);
-
- if (i8259_is_pending()) {
- if (vcpu_pic_intr(vm_id, vcpu_id, 1))
- fatalx("%s: can't assert INTR", __func__);
-
- mutex_lock(&vm_mtx);
- vcpu_hlt[vcpu_id] = 0;
- mutex_unlock(&vm_mtx);
-
- mutex_lock(&vcpu_run_mtx[vcpu_id]);
- ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
- if (ret)
- fatalx("%s: can't signal (%d)", __func__, ret);
- mutex_unlock(&vcpu_run_mtx[vcpu_id]);
- }
-}
-
-/*
- * vcpu_deassert_pic_irq
- *
- * Clears the specified IRQ on the supplied vcpu/vm
- *
- * Parameters:
- * vm_id: VM ID to clear in
- * vcpu_id: VCPU ID to clear in
- * irq: IRQ to clear
- */
-void
-vcpu_deassert_pic_irq(uint32_t vm_id, uint32_t vcpu_id, int irq)
-{
- i8259_deassert_irq(irq);
-
- if (!i8259_is_pending()) {
- if (vcpu_pic_intr(vm_id, vcpu_id, 0))
- fatalx("%s: can't deassert INTR for vm_id %d, "
- "vcpu_id %d", __func__, vm_id, vcpu_id);
- }
-}
-
-/*
* fd_hasdata
*
* Determines if data can be read from a file descriptor.
@@ -2258,203 +1278,6 @@ mutex_unlock(pthread_mutex_t *m)
}
}
-/*
- * set_return_data
- *
- * Utility function for manipulating register data in vm exit info structs. This
- * function ensures that the data is copied to the vei->vei.vei_data field with
- * the proper size for the operation being performed.
- *
- * Parameters:
- * vei: exit information
- * data: return data
- */
-void
-set_return_data(struct vm_exit *vei, uint32_t data)
-{
- switch (vei->vei.vei_size) {
- case 1:
- vei->vei.vei_data &= ~0xFF;
- vei->vei.vei_data |= (uint8_t)data;
- break;
- case 2:
- vei->vei.vei_data &= ~0xFFFF;
- vei->vei.vei_data |= (uint16_t)data;
- break;
- case 4:
- vei->vei.vei_data = data;
- break;
- }
-}
-
-/*
- * get_input_data
- *
- * Utility function for manipulating register data in vm exit info
- * structs. This function ensures that the data is copied from the
- * vei->vei.vei_data field with the proper size for the operation being
- * performed.
- *
- * Parameters:
- * vei: exit information
- * data: location to store the result
- */
-void
-get_input_data(struct vm_exit *vei, uint32_t *data)
-{
- switch (vei->vei.vei_size) {
- case 1:
- *data &= 0xFFFFFF00;
- *data |= (uint8_t)vei->vei.vei_data;
- break;
- case 2:
- *data &= 0xFFFF0000;
- *data |= (uint16_t)vei->vei.vei_data;
- break;
- case 4:
- *data = vei->vei.vei_data;
- break;
- default:
- log_warnx("%s: invalid i/o size %d", __func__,
- vei->vei.vei_size);
- }
-
-}
-
-/*
- * translate_gva
- *
- * Translates a guest virtual address to a guest physical address by walking
- * the currently active page table (if needed).
- *
- * XXX ensure translate_gva updates the A bit in the PTE
- * XXX ensure translate_gva respects segment base and limits in i386 mode
- * XXX ensure translate_gva respects segment wraparound in i8086 mode
- * XXX ensure translate_gva updates the A bit in the segment selector
- * XXX ensure translate_gva respects CR4.LMSLE if available
- *
- * Parameters:
- * exit: The VCPU this translation should be performed for (guest MMU settings
- * are gathered from this VCPU)
- * va: virtual address to translate
- * pa: pointer to paddr_t variable that will receive the translated physical
- * address. 'pa' is unchanged on error.
- * mode: one of PROT_READ, PROT_WRITE, PROT_EXEC indicating the mode in which
- * the address should be translated
- *
- * Return values:
- * 0: the address was successfully translated - 'pa' contains the physical
- * address currently mapped by 'va'.
- * EFAULT: the PTE for 'VA' is unmapped. A #PF will be injected in this case
- * and %cr2 set in the vcpu structure.
- * EINVAL: an error occurred reading paging table structures
- */
-int
-translate_gva(struct vm_exit* exit, uint64_t va, uint64_t* pa, int mode)
-{
- int level, shift, pdidx;
- uint64_t pte, pt_paddr, pte_paddr, mask, low_mask, high_mask;
- uint64_t shift_width, pte_size;
- struct vcpu_reg_state *vrs;
-
- vrs = &exit->vrs;
-
- if (!pa)
- return (EINVAL);
-
- if (!(vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PG)) {
- log_debug("%s: unpaged, va=pa=0x%llx", __func__, va);
- *pa = va;
- return (0);
- }
-
- pt_paddr = vrs->vrs_crs[VCPU_REGS_CR3];
-
- log_debug("%s: guest %%cr0=0x%llx, %%cr3=0x%llx", __func__,
- vrs->vrs_crs[VCPU_REGS_CR0], vrs->vrs_crs[VCPU_REGS_CR3]);
-
- if (vrs->vrs_crs[VCPU_REGS_CR0] & CR0_PE) {
- if (vrs->vrs_crs[VCPU_REGS_CR4] & CR4_PAE) {
- pte_size = sizeof(uint64_t);
- shift_width = 9;
-
- if (vrs->vrs_msrs[VCPU_REGS_EFER] & EFER_LMA) {
- /* 4 level paging */
- level = 4;
- mask = L4_MASK;
- shift = L4_SHIFT;
- } else {
- /* 32 bit with PAE paging */
- level = 3;
- mask = L3_MASK;
- shift = L3_SHIFT;
- }
- } else {
- /* 32 bit paging */
- level = 2;
- shift_width = 10;
- mask = 0xFFC00000;
- shift = 22;
- pte_size = sizeof(uint32_t);
- }
- } else
- return (EINVAL);
-
- /* XXX: Check for R bit in segment selector and set A bit */
-
- for (;level > 0; level--) {
- pdidx = (va & mask) >> shift;
- pte_paddr = (pt_paddr) + (pdidx * pte_size);
-
- log_debug("%s: read pte level %d @ GPA 0x%llx", __func__,
- level, pte_paddr);
- if (read_mem(pte_paddr, &pte, pte_size)) {
- log_warn("%s: failed to read pte", __func__);
- return (EFAULT);
- }
-
- log_debug("%s: PTE @ 0x%llx = 0x%llx", __func__, pte_paddr,
- pte);
-
- /* XXX: Set CR2 */
- if (!(pte & PG_V))
- return (EFAULT);
-
- /* XXX: Check for SMAP */
- if ((mode == PROT_WRITE) && !(pte & PG_RW))
- return (EPERM);
-
- if ((exit->cpl > 0) && !(pte & PG_u))
- return (EPERM);
-
- pte = pte | PG_U;
- if (mode == PROT_WRITE)
- pte = pte | PG_M;
- if (write_mem(pte_paddr, &pte, pte_size)) {
- log_warn("%s: failed to write back flags to pte",
- __func__);
- return (EIO);
- }
-
- /* XXX: EINVAL if in 32bit and PG_PS is 1 but CR4.PSE is 0 */
- if (pte & PG_PS)
- break;
-
- if (level > 1) {
- pt_paddr = pte & PG_FRAME;
- shift -= shift_width;
- mask = mask >> shift_width;
- }
- }
-
- low_mask = (1 << shift) - 1;
- high_mask = (((uint64_t)1ULL << ((pte_size * 8) - 1)) - 1) ^ low_mask;
- *pa = (pte & high_mask) | (va & low_mask);
-
- log_debug("%s: final GPA for GVA 0x%llx = 0x%llx\n", __func__, va, *pa);
-
- return (0);
-}
void
vm_pipe_init(struct vm_dev_pipe *p, void (*cb)(int, short, void *))
@@ -2619,3 +1442,56 @@ remap_guest_mem(struct vmd_vm *vm, int vmm_fd)
return (0);
}
+
+/*
+ * vcpu_halt
+ *
+ * Records that a vcpu is halted by setting its vcpu_hlt flag under vm_mtx.
+ *
+ * Parameters:
+ * vcpu_id: index into vcpu_hlt of the vcpu to mark as halted
+ */
+void
+vcpu_halt(uint32_t vcpu_id)
+{
+ mutex_lock(&vm_mtx);
+ vcpu_hlt[vcpu_id] = 1;
+ mutex_unlock(&vm_mtx);
+}
+
+/*
+ * vcpu_unhalt
+ *
+ * Clears a vcpu's vcpu_hlt flag under vm_mtx.
+ *
+ * Parameters:
+ * vcpu_id: index into vcpu_hlt of the vcpu to mark as no longer halted
+ */
+void
+vcpu_unhalt(uint32_t vcpu_id)
+{
+ mutex_lock(&vm_mtx);
+ vcpu_hlt[vcpu_id] = 0;
+ mutex_unlock(&vm_mtx);
+}
+
+/*
+ * vcpu_signal_run
+ *
+ * Signals the vcpu's run condition variable (vcpu_run_cond) while holding
+ * the matching vcpu_run_mtx. Calls fatalx() if pthread_cond_signal fails.
+ *
+ * Parameters:
+ * vcpu_id: index of the vcpu whose run condition is signalled
+ */
+void
+vcpu_signal_run(uint32_t vcpu_id)
+{
+ int ret;
+
+ mutex_lock(&vcpu_run_mtx[vcpu_id]);
+ ret = pthread_cond_signal(&vcpu_run_cond[vcpu_id]);
+ if (ret)
+ fatalx("%s: can't signal (%d)", __func__, ret);
+ mutex_unlock(&vcpu_run_mtx[vcpu_id]);
+}
diff --git a/usr.sbin/vmd/vmd.c b/usr.sbin/vmd/vmd.c
index 3c053ae08a2..232bc82d8d2 100644
--- a/usr.sbin/vmd/vmd.c
+++ b/usr.sbin/vmd/vmd.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: vmd.c,v 1.158 2024/07/09 09:31:37 dv Exp $ */
+/* $OpenBSD: vmd.c,v 1.159 2024/07/10 09:27:33 dv Exp $ */
/*
* Copyright (c) 2015 Reyk Floeter <reyk@openbsd.org>
@@ -41,7 +41,6 @@
#include <grp.h>
#include <dev/vmm/vmm.h>
-#include <machine/specialreg.h>
#include "proc.h"
#include "atomicio.h"
@@ -613,134 +612,6 @@ vmd_dispatch_priv(int fd, struct privsep_proc *p, struct imsg *imsg)
return (0);
}
-int
-vmd_check_vmh(struct vm_dump_header *vmh)
-{
- int i;
- unsigned int code, leaf;
- unsigned int a, b, c, d;
-
- if (strncmp(vmh->vmh_signature, VM_DUMP_SIGNATURE, strlen(VM_DUMP_SIGNATURE)) != 0) {
- log_warnx("%s: incompatible dump signature", __func__);
- return (-1);
- }
-
- if (vmh->vmh_version != VM_DUMP_VERSION) {
- log_warnx("%s: incompatible dump version", __func__);
- return (-1);
- }
-
- for (i = 0; i < VM_DUMP_HEADER_CPUID_COUNT; i++) {
- code = vmh->vmh_cpuids[i].code;
- leaf = vmh->vmh_cpuids[i].leaf;
- if (leaf != 0x00) {
- log_debug("%s: invalid leaf 0x%x for code 0x%x",
- __func__, leaf, code);
- return (-1);
- }
-
- switch (code) {
- case 0x00:
- CPUID_LEAF(code, leaf, a, b, c, d);
- if (vmh->vmh_cpuids[i].a > a) {
- log_debug("%s: incompatible cpuid level",
- __func__);
- return (-1);
- }
- if (!(vmh->vmh_cpuids[i].b == b &&
- vmh->vmh_cpuids[i].c == c &&
- vmh->vmh_cpuids[i].d == d)) {
- log_debug("%s: incompatible cpu brand",
- __func__);
- return (-1);
- }
- break;
-
- case 0x01:
- CPUID_LEAF(code, leaf, a, b, c, d);
- if ((vmh->vmh_cpuids[i].c & c & VMM_CPUIDECX_MASK) !=
- (vmh->vmh_cpuids[i].c & VMM_CPUIDECX_MASK)) {
- log_debug("%s: incompatible cpu features "
- "code: 0x%x leaf: 0x%x reg: c", __func__,
- code, leaf);
- return (-1);
- }
- if ((vmh->vmh_cpuids[i].d & d & VMM_CPUIDEDX_MASK) !=
- (vmh->vmh_cpuids[i].d & VMM_CPUIDEDX_MASK)) {
- log_debug("%s: incompatible cpu features "
- "code: 0x%x leaf: 0x%x reg: d", __func__,
- code, leaf);
- return (-1);
- }
- break;
-
- case 0x07:
- CPUID_LEAF(code, leaf, a, b, c, d);
- if ((vmh->vmh_cpuids[i].b & b & VMM_SEFF0EBX_MASK) !=
- (vmh->vmh_cpuids[i].b & VMM_SEFF0EBX_MASK)) {
- log_debug("%s: incompatible cpu features "
- "code: 0x%x leaf: 0x%x reg: c", __func__,
- code, leaf);
- return (-1);
- }
- if ((vmh->vmh_cpuids[i].c & c & VMM_SEFF0ECX_MASK) !=
- (vmh->vmh_cpuids[i].c & VMM_SEFF0ECX_MASK)) {
- log_debug("%s: incompatible cpu features "
- "code: 0x%x leaf: 0x%x reg: d", __func__,
- code, leaf);
- return (-1);
- }
- break;
-
- case 0x0d:
- CPUID_LEAF(code, leaf, a, b, c, d);
- if (vmh->vmh_cpuids[i].b > b) {
- log_debug("%s: incompatible cpu: insufficient "
- "max save area for enabled XCR0 features",
- __func__);
- return (-1);
- }
- if (vmh->vmh_cpuids[i].c > c) {
- log_debug("%s: incompatible cpu: insufficient "
- "max save area for supported XCR0 features",
- __func__);
- return (-1);
- }
- break;
-
- case 0x80000001:
- CPUID_LEAF(code, leaf, a, b, c, d);
- if ((vmh->vmh_cpuids[i].a & a) !=
- vmh->vmh_cpuids[i].a) {
- log_debug("%s: incompatible cpu features "
- "code: 0x%x leaf: 0x%x reg: a", __func__,
- code, leaf);
- return (-1);
- }
- if ((vmh->vmh_cpuids[i].c & c) !=
- vmh->vmh_cpuids[i].c) {
- log_debug("%s: incompatible cpu features "
- "code: 0x%x leaf: 0x%x reg: c", __func__,
- code, leaf);
- return (-1);
- }
- if ((vmh->vmh_cpuids[i].d & d) !=
- vmh->vmh_cpuids[i].d) {
- log_debug("%s: incompatible cpu features "
- "code: 0x%x leaf: 0x%x reg: d", __func__,
- code, leaf);
- return (-1);
- }
- break;
-
- default:
- log_debug("%s: unknown code 0x%x", __func__, code);
- return (-1);
- }
- }
-
- return (0);
-}
void
vmd_sighdlr(int sig, short event, void *arg)
diff --git a/usr.sbin/vmd/vmd.h b/usr.sbin/vmd/vmd.h
index 4f1b05e7058..2f2056541c8 100644
--- a/usr.sbin/vmd/vmd.h
+++ b/usr.sbin/vmd/vmd.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: vmd.h,v 1.126 2024/07/09 09:31:37 dv Exp $ */
+/* $OpenBSD: vmd.h,v 1.127 2024/07/10 09:27:33 dv Exp $ */
/*
* Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
@@ -43,6 +43,9 @@
#define nitems(_a) (sizeof((_a)) / sizeof((_a)[0]))
+#define MB(x) ((x) * 1024UL * 1024UL) /* x mebibytes, in bytes */
+#define GB(x) ((x) * 1024UL * 1024UL * 1024UL) /* x gibibytes, in bytes */
+
#define VMD_USER "_vmd"
#define VMD_CONF "/etc/vm.conf"
#define SOCKET_NAME "/var/run/vmd.sock"
@@ -492,21 +495,51 @@ int opentap(char *);
int fd_hasdata(int);
int vmm_pipe(struct vmd_vm *, int, void (*)(int, short, void *));
-/* vm.c */
+/* {mach}_vm.c (md interface) */
+void create_memory_map(struct vm_create_params *);
+int load_firmware(struct vmd_vm *, struct vcpu_reg_state *);
+void init_emulated_hw(struct vmop_create_params *, int,
+ int[][VM_MAX_BASE_PER_DISK], int *);
+void restore_emulated_hw(struct vm_create_params *, int, int *,
+ int[][VM_MAX_BASE_PER_DISK], int);
+int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *);
+void pause_vm_md(struct vmd_vm *);
+void unpause_vm_md(struct vmd_vm *);
+int dump_devs(int);
+int dump_send_header(int);
+void *hvaddr_mem(paddr_t, size_t);
+int write_mem(paddr_t, const void *, size_t);
+int read_mem(paddr_t, void *, size_t);
+int intr_ack(struct vmd_vm *);
+int intr_pending(struct vmd_vm *);
+void intr_toggle_el(struct vmd_vm *, int, int);
+void vcpu_assert_irq(uint32_t, uint32_t, int);
+void vcpu_deassert_irq(uint32_t, uint32_t, int);
+int vcpu_exit(struct vm_run_params *);
+uint8_t vcpu_exit_pci(struct vm_run_params *);
+
+#ifdef __amd64__
+/* x86 io functions in x86_vm.c */
+void set_return_data(struct vm_exit *, uint32_t);
+void get_input_data(struct vm_exit *, uint32_t *);
+#endif /* __amd64__ */
+
+/* vm.c (mi functions) */
+void vcpu_halt(uint32_t);
+void vcpu_unhalt(uint32_t);
+void vcpu_signal_run(uint32_t);
+int vcpu_intr(uint32_t, uint32_t, uint8_t);
void vm_main(int, int);
void mutex_lock(pthread_mutex_t *);
void mutex_unlock(pthread_mutex_t *);
-int read_mem(paddr_t, void *buf, size_t);
-int start_vm(struct vmd_vm *, int);
-__dead void vm_shutdown(unsigned int);
+int vmd_check_vmh(struct vm_dump_header *);
void vm_pipe_init(struct vm_dev_pipe *, void (*)(int, short, void *));
void vm_pipe_init2(struct vm_dev_pipe *, void (*)(int, short, void *),
void *);
void vm_pipe_send(struct vm_dev_pipe *, enum pipe_msg_type);
enum pipe_msg_type vm_pipe_recv(struct vm_dev_pipe *);
-int write_mem(paddr_t, const void *buf, size_t);
-void* hvaddr_mem(paddr_t, size_t);
int remap_guest_mem(struct vmd_vm *, int);
+__dead void vm_shutdown(unsigned int);
/* config.c */
int config_init(struct vmd *);
diff --git a/usr.sbin/vmd/vmm.c b/usr.sbin/vmd/vmm.c
index 70c94c0dff8..6a98e43f751 100644
--- a/usr.sbin/vmd/vmm.c
+++ b/usr.sbin/vmd/vmm.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: vmm.c,v 1.120 2024/07/09 09:31:37 dv Exp $ */
+/* $OpenBSD: vmm.c,v 1.121 2024/07/10 09:27:33 dv Exp $ */
/*
* Copyright (c) 2015 Mike Larkin <mlarkin@openbsd.org>
@@ -30,9 +30,6 @@
#include <dev/pci/pcireg.h>
#include <dev/vmm/vmm.h>
-#include <machine/psl.h>
-#include <machine/specialreg.h>
-
#include <net/if.h>
#include <errno.h>
@@ -50,7 +47,6 @@
#include <util.h>
#include "vmd.h"
-#include "vmm.h"
#include "atomicio.h"
#include "proc.h"