diff options
author | Dave Voutila <dv@cvs.openbsd.org> | 2024-01-30 23:01:50 +0000 |
---|---|---|
committer | Dave Voutila <dv@cvs.openbsd.org> | 2024-01-30 23:01:50 +0000 |
commit | 29c27b0d5072c37d4d3f9de540a9ad0bc0278324 (patch) | |
tree | f77447ea6d0bd7de51bbba1a1f149b909082ef60 /usr.sbin/vmd/vionet.c | |
parent | ca563e138bc87756fd59e84dfccc717388b03109 (diff) |
Rewrite vmd(8)'s vionet to be zero-copy.
Similar to the rewrite of the virtio block device to use zero-copy
semantics, this rewrites how the virtio network device works with
the virtqueue ring buffers to minimize data copying. For guests
that don't use the built-in DNS and mac filtering capabilities,
data can now be transfered to/from the virtqueue and the tap(4)
directly without temporary buffers.
A lot of the virtio semantics are cleaned up as well, including
proper error states.
Tested with help by mbuhl@, friehm@, mlarkin@, and others.
"go for it," mlarkin@
Diffstat (limited to 'usr.sbin/vmd/vionet.c')
-rw-r--r-- | usr.sbin/vmd/vionet.c | 716 |
1 files changed, 432 insertions, 284 deletions
diff --git a/usr.sbin/vmd/vionet.c b/usr.sbin/vmd/vionet.c index 810adde2e88..b973577fad9 100644 --- a/usr.sbin/vmd/vionet.c +++ b/usr.sbin/vmd/vionet.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vionet.c,v 1.7 2024/01/03 03:14:16 dv Exp $ */ +/* $OpenBSD: vionet.c,v 1.8 2024/01/30 23:01:49 dv Exp $ */ /* * Copyright (c) 2023 Dave Voutila <dv@openbsd.org> @@ -16,9 +16,8 @@ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -#include <sys/mman.h> -#include <sys/param.h> /* PAGE_SIZE */ #include <sys/socket.h> +#include <sys/types.h> #include <dev/pci/virtio_pcireg.h> #include <dev/pv/virtioreg.h> @@ -26,7 +25,6 @@ #include <net/if.h> #include <netinet/in.h> #include <netinet/if_ether.h> -#include <netinet/ip.h> #include <errno.h> #include <event.h> @@ -36,7 +34,6 @@ #include <unistd.h> #include "atomicio.h" -#include "pci.h" #include "virtio.h" #include "vmd.h" @@ -47,20 +44,36 @@ extern char *__progname; extern struct vmd_vm *current_vm; -/* Device Globals */ -struct event ev_tap; +struct packet { + uint8_t *buf; + size_t len; +}; -static int vionet_rx(struct vionet_dev *); +static int vionet_rx(struct vionet_dev *, int); +static ssize_t vionet_rx_copy(struct vionet_dev *, int, const struct iovec *, + int, size_t); +static ssize_t vionet_rx_zerocopy(struct vionet_dev *, int, + const struct iovec *, int); static void vionet_rx_event(int, short, void *); static uint32_t handle_io_read(struct viodev_msg *, struct virtio_dev *, int8_t *); static int handle_io_write(struct viodev_msg *, struct virtio_dev *); -void vionet_notify_rx(struct virtio_dev *); -int vionet_notifyq(struct virtio_dev *); +static int vionet_notify_tx(struct virtio_dev *); +static int vionet_notifyq(struct virtio_dev *); static void dev_dispatch_vm(int, short, void *); static void handle_sync_io(int, short, void *); +/* Device Globals */ +struct event ev_tap; +struct event ev_inject; +int pipe_inject[2]; +#define READ 0 +#define WRITE 1 +struct iovec iov_rx[VIONET_QUEUE_SIZE]; +struct iovec iov_tx[VIONET_QUEUE_SIZE]; + + __dead void vionet_main(int fd, int fd_vmm) { @@ -79,6 +92,10 @@ vionet_main(int fd, int fd_vmm) if (pledge("stdio vmm proc", NULL) == -1) fatal("pledge"); + /* Initialize iovec arrays. */ + memset(iov_rx, 0, sizeof(iov_rx)); + memset(iov_tx, 0, sizeof(iov_tx)); + /* Receive our vionet_dev, mostly preconfigured. */ sz = atomicio(read, fd, &dev, sizeof(dev)); if (sz != sizeof(dev)) { @@ -153,6 +170,12 @@ vionet_main(int fd, int fd_vmm) } } + /* Initialize our packet injection pipe. */ + if (pipe2(pipe_inject, O_NONBLOCK) == -1) { + log_warn("%s: injection pipe", __func__); + goto fail; + } + /* Initialize libevent so we can start wiring event handlers. */ event_init(); @@ -171,6 +194,12 @@ vionet_main(int fd, int fd_vmm) event_set(&ev_tap, vionet->data_fd, EV_READ | EV_PERSIST, vionet_rx_event, &dev); + /* Add an event for injected packets. */ + log_debug("%s: wiring in packet injection handler (fd=%d)", __func__, + pipe_inject[READ]); + event_set(&ev_inject, pipe_inject[READ], EV_READ | EV_PERSIST, + vionet_rx_event, &dev); + /* Configure our sync channel event handler. */ log_debug("%s: wiring in sync channel handler (fd=%d)", __func__, dev.sync_fd); @@ -209,6 +238,8 @@ vionet_main(int fd, int fd_vmm) close_fd(dev.sync_fd); close_fd(dev.async_fd); close_fd(vionet->data_fd); + close_fd(pipe_inject[READ]); + close_fd(pipe_inject[WRITE]); _exit(ret); /* NOTREACHED */ } @@ -223,6 +254,8 @@ fail: close_fd(dev.sync_fd); close_fd(dev.async_fd); + close_fd(pipe_inject[READ]); + close_fd(pipe_inject[WRITE]); if (vionet != NULL) close_fd(vionet->data_fd); @@ -232,7 +265,7 @@ fail: /* * Update the gpa and hva of the virtqueue. */ -void +static void vionet_update_qa(struct vionet_dev *dev) { struct virtio_vq_info *vq_info; @@ -259,7 +292,7 @@ vionet_update_qa(struct vionet_dev *dev) /* * Update the queue size. */ -void +static void vionet_update_qs(struct vionet_dev *dev) { struct virtio_vq_info *vq_info; @@ -280,33 +313,27 @@ vionet_update_qs(struct vionet_dev *dev) } /* - * vionet_enq_rx + * vionet_rx + * + * Pull packet from the provided fd and fill the receive-side virtqueue. We + * selectively use zero-copy approaches when possible. * - * Take a given packet from the host-side tap and copy it into the guest's - * buffers utilizing the rx virtio ring. If the packet length is invalid - * (too small or too large) or if there are not enough buffers available, - * the packet is dropped. + * Returns 1 if guest notification is needed. Otherwise, returns -1 on failure + * or 0 if no notification is needed. */ -int -vionet_enq_rx(struct vionet_dev *dev, char *pkt, size_t sz, int *spc) +static int +vionet_rx(struct vionet_dev *dev, int fd) { - uint16_t dxx, idx, hdr_desc_idx, chain_hdr_idx; + uint16_t idx, hdr_idx; char *vr = NULL; - size_t bufsz = 0, off = 0, pkt_offset = 0, chunk_size = 0; - size_t chain_len = 0; - struct vring_desc *desc, *pkt_desc, *hdr_desc; + size_t chain_len = 0, iov_cnt; + struct vring_desc *desc, *table; struct vring_avail *avail; struct vring_used *used; struct virtio_vq_info *vq_info; - struct virtio_net_hdr hdr; - size_t hdr_sz; - - if (sz < VIONET_MIN_TXLEN || sz > VIONET_MAX_TXLEN) { - log_warnx("%s: invalid packet size", __func__); - return (0); - } - - hdr_sz = sizeof(hdr); + struct iovec *iov; + int notify = 0; + ssize_t sz; if (!(dev->cfg.device_status & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK)) { @@ -315,178 +342,287 @@ vionet_enq_rx(struct vionet_dev *dev, char *pkt, size_t sz, int *spc) } vq_info = &dev->vq[RXQ]; + idx = vq_info->last_avail; vr = vq_info->q_hva; if (vr == NULL) fatalx("%s: vr == NULL", __func__); - /* Compute offsets in ring of descriptors, avail ring, and used ring */ - desc = (struct vring_desc *)(vr); + table = (struct vring_desc *)(vr); avail = (struct vring_avail *)(vr + vq_info->vq_availoffset); used = (struct vring_used *)(vr + vq_info->vq_usedoffset); + used->flags |= VRING_USED_F_NO_NOTIFY; + + while (idx != avail->idx) { + hdr_idx = avail->ring[idx & VIONET_QUEUE_MASK]; + desc = &table[hdr_idx & VIONET_QUEUE_MASK]; + if (!DESC_WRITABLE(desc)) { + log_warnx("%s: invalid descriptor state", __func__); + goto reset; + } - idx = vq_info->last_avail & VIONET_QUEUE_MASK; - if ((vq_info->notified_avail & VIONET_QUEUE_MASK) == idx) { - log_debug("%s: insufficient available buffer capacity, " - "dropping packet.", __func__); - return (0); - } + iov = &iov_rx[0]; + iov_cnt = 1; - hdr_desc_idx = avail->ring[idx] & VIONET_QUEUE_MASK; - hdr_desc = &desc[hdr_desc_idx]; + /* + * First descriptor should be at least as large as the + * virtio_net_hdr. It's not technically required, but in + * legacy devices it should be safe to assume. + */ + iov->iov_len = desc->len; + if (iov->iov_len < sizeof(struct virtio_net_hdr)) { + log_warnx("%s: invalid descriptor length", __func__); + goto reset; + } - dxx = hdr_desc_idx; - chain_hdr_idx = dxx; - chain_len = 0; + /* + * Insert the virtio_net_hdr and adjust len/base. We do the + * pointer math here before it's a void*. + */ + iov->iov_base = hvaddr_mem(desc->addr, iov->iov_len); + if (iov->iov_base == NULL) + goto reset; + memset(iov->iov_base, 0, sizeof(struct virtio_net_hdr)); + + /* Tweak the iovec to account for the virtio_net_hdr. */ + iov->iov_len -= sizeof(struct virtio_net_hdr); + iov->iov_base = hvaddr_mem(desc->addr + + sizeof(struct virtio_net_hdr), iov->iov_len); + if (iov->iov_base == NULL) + goto reset; + chain_len = iov->iov_len; - /* Process the descriptor and walk any potential chain. */ - do { - off = 0; - pkt_desc = &desc[dxx]; - if (!(pkt_desc->flags & VRING_DESC_F_WRITE)) { - log_warnx("%s: invalid descriptor, not writable", + /* + * Walk the remaining chain and collect remaining addresses + * and lengths. + */ + while (desc->flags & VRING_DESC_F_NEXT) { + desc = &table[desc->next & VIONET_QUEUE_MASK]; + if (!DESC_WRITABLE(desc)) { + log_warnx("%s: invalid descriptor state", + __func__); + goto reset; + } + + /* Collect our IO information. Translate gpa's. */ + iov = &iov_rx[iov_cnt]; + iov->iov_len = desc->len; + iov->iov_base = hvaddr_mem(desc->addr, iov->iov_len); + if (iov->iov_base == NULL) + goto reset; + chain_len += iov->iov_len; + + /* Guard against infinitely looping chains. */ + if (++iov_cnt >= nitems(iov_rx)) { + log_warnx("%s: infinite chain detected", + __func__); + goto reset; + } + } + + /* Make sure the driver gave us the bare minimum buffers. */ + if (chain_len < VIONET_MIN_TXLEN) { + log_warnx("%s: insufficient buffers provided", __func__); - return (0); + goto reset; } - /* How much data do we get to write? */ - if (sz - bufsz > pkt_desc->len) - chunk_size = pkt_desc->len; + /* + * If we're enforcing hardware address or handling an injected + * packet, we need to use a copy-based approach. + */ + if (dev->lockedmac || fd != dev->data_fd) + sz = vionet_rx_copy(dev, fd, iov_rx, iov_cnt, + chain_len); else - chunk_size = sz - bufsz; + sz = vionet_rx_zerocopy(dev, fd, iov_rx, iov_cnt); + if (sz == -1) + goto reset; + if (sz == 0) /* No packets, so bail out for now. */ + break; + + /* + * Account for the prefixed header since it wasn't included + * in the copy or zerocopy operations. + */ + sz += sizeof(struct virtio_net_hdr); + + /* Mark our buffers as used. */ + used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_idx; + used->ring[used->idx & VIONET_QUEUE_MASK].len = sz; + __sync_synchronize(); + used->idx++; + idx++; + } + + if (idx != vq_info->last_avail && + !(avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { + notify = 1; + dev->cfg.isr_status |= 1; + } - if (chain_len == 0) { - off = hdr_sz; - if (chunk_size == pkt_desc->len) - chunk_size -= off; + vq_info->last_avail = idx; + return (notify); +reset: + log_warnx("%s: requesting device reset", __func__); + dev->cfg.device_status |= DEVICE_NEEDS_RESET; + dev->cfg.isr_status |= VIRTIO_CONFIG_ISR_CONFIG_CHANGE; + return (1); +} + +/* + * vionet_rx_copy + * + * Read a packet off the provided file descriptor, validating packet + * characteristics, and copy into the provided buffers in the iovec array. + * + * It's assumed that the provided iovec array contains validated host virtual + * address translations and not guest physical addreses. + * + * Returns number of bytes copied on success, 0 if packet is dropped, and + * -1 on an error. + */ +ssize_t +vionet_rx_copy(struct vionet_dev *dev, int fd, const struct iovec *iov, + int iov_cnt, size_t chain_len) +{ + static uint8_t buf[VIONET_HARD_MTU]; + struct packet *pkt = NULL; + struct ether_header *eh = NULL; + uint8_t *payload = buf; + size_t i, chunk, nbytes, copied = 0; + ssize_t sz; + + /* If reading from the tap(4), try to right-size the read. */ + if (fd == dev->data_fd) + nbytes = MIN(chain_len, VIONET_HARD_MTU); + else if (fd == pipe_inject[READ]) + nbytes = sizeof(struct packet); + else { + log_warnx("%s: invalid fd: %d", __func__, fd); + return (-1); + } + + /* + * Try to pull a packet. The fd should be non-blocking and we don't + * care if we under-read (i.e. sz != nbytes) as we may not have a + * packet large enough to fill the buffer. + */ + sz = read(fd, buf, nbytes); + if (sz == -1) { + if (errno != EAGAIN) { + log_warn("%s: error reading packet", __func__); + return (-1); } + return (0); + } else if (fd == dev->data_fd && sz < VIONET_MIN_TXLEN) { + /* If reading the tap(4), we should get valid ethernet. */ + log_warnx("%s: invalid packet size", __func__); + return (0); + } else if (sz != sizeof(struct packet)) { + log_warnx("%s: invalid injected packet object", __func__); + return (0); + } - /* Write a chunk of data if we need to */ - if (chunk_size && write_mem(pkt_desc->addr + off, - pkt + pkt_offset, chunk_size)) { - log_warnx("%s: failed to write to buffer 0x%llx", - __func__, pkt_desc->addr); + /* Decompose an injected packet, if that's what we're working with. */ + if (fd == pipe_inject[READ]) { + pkt = (struct packet *)buf; + if (pkt->buf == NULL) { + log_warnx("%s: invalid injected packet, no buffer", + __func__); return (0); } + if (sz < VIONET_MIN_TXLEN || sz > VIONET_MAX_TXLEN) { + log_warnx("%s: invalid injected packet size", __func__); + goto drop; + } + payload = pkt->buf; + sz = (ssize_t)pkt->len; + } - chain_len += chunk_size + off; - bufsz += chunk_size; - pkt_offset += chunk_size; - - dxx = pkt_desc->next & VIONET_QUEUE_MASK; - } while (bufsz < sz && pkt_desc->flags & VRING_DESC_F_NEXT); + /* Validate the ethernet header, if required. */ + if (dev->lockedmac) { + eh = (struct ether_header *)(payload); + if (!ETHER_IS_MULTICAST(eh->ether_dhost) && + memcmp(eh->ether_dhost, dev->mac, + sizeof(eh->ether_dhost)) != 0) + goto drop; + } - /* Move our marker in the ring...*/ - vq_info->last_avail = (vq_info->last_avail + 1) & - VIONET_QUEUE_MASK; + /* Truncate one last time to the chain length, if shorter. */ + sz = MIN(chain_len, (size_t)sz); - /* Prepend the virtio net header in the first buffer. */ - memset(&hdr, 0, sizeof(hdr)); - hdr.hdr_len = hdr_sz; - if (write_mem(hdr_desc->addr, &hdr, hdr_sz)) { - log_warnx("vionet: rx enq header write_mem error @ 0x%llx", - hdr_desc->addr); - return (0); + /* + * Copy the packet into the provided buffers. We can use memcpy(3) + * here as the gpa was validated and translated to an hva previously. + */ + for (i = 0; (int)i < iov_cnt && (size_t)sz > copied; i++) { + chunk = MIN(iov[i].iov_len, (size_t)(sz - copied)); + memcpy(iov[i].iov_base, payload + copied, chunk); + copied += chunk; } - /* Update the index field in the used ring. This must be done last. */ - dev->cfg.isr_status = 1; - *spc = (vq_info->notified_avail - vq_info->last_avail) - & VIONET_QUEUE_MASK; - - /* Update the list of used buffers. */ - used->ring[used->idx & VIONET_QUEUE_MASK].id = chain_hdr_idx; - used->ring[used->idx & VIONET_QUEUE_MASK].len = chain_len; - __sync_synchronize(); - used->idx++; +drop: + /* Free any injected packet buffer. */ + if (pkt != NULL) + free(pkt->buf); - return (1); + return (copied); } /* - * vionet_rx + * vionet_rx_zerocopy + * + * Perform a vectorized read from the given fd into the guest physical memory + * pointed to by iovecs. + * + * Returns number of bytes read on success, -1 on error, or 0 if EAGAIN was + * returned by readv. * - * Enqueue data that was received on a tap file descriptor - * to the vionet device queue. */ -static int -vionet_rx(struct vionet_dev *dev) +static ssize_t +vionet_rx_zerocopy(struct vionet_dev *dev, int fd, const struct iovec *iov, + int iov_cnt) { - char buf[PAGE_SIZE]; - int num_enq = 0, spc = 0; - struct ether_header *eh; - ssize_t sz; + ssize_t sz; - do { - sz = read(dev->data_fd, buf, sizeof(buf)); - if (sz == -1) { - /* - * If we get EAGAIN, No data is currently available. - * Do not treat this as an error. - */ - if (errno != EAGAIN) - log_warn("%s: read error", __func__); - } else if (sz > 0) { - eh = (struct ether_header *)buf; - if (!dev->lockedmac || - ETHER_IS_MULTICAST(eh->ether_dhost) || - memcmp(eh->ether_dhost, dev->mac, - sizeof(eh->ether_dhost)) == 0) - num_enq += vionet_enq_rx(dev, buf, sz, &spc); - } else if (sz == 0) { - log_debug("%s: no data", __func__); - break; - } - } while (spc > 0 && sz > 0); + if (dev->lockedmac) { + log_warnx("%s: zerocopy not available for locked lladdr", + __func__); + return (-1); + } - return (num_enq); + sz = readv(fd, iov, iov_cnt); + if (sz == -1 && errno == EAGAIN) + return (0); + return (sz); } + /* * vionet_rx_event * * Called when new data can be received on the tap fd of a vionet device. */ static void -vionet_rx_event(int fd, short kind, void *arg) +vionet_rx_event(int fd, short event, void *arg) { struct virtio_dev *dev = (struct virtio_dev *)arg; - if (vionet_rx(&dev->vionet) > 0) - virtio_assert_pic_irq(dev, 0); -} + if (!(event & EV_READ)) + fatalx("%s: invalid event type", __func__); -void -vionet_notify_rx(struct virtio_dev *dev) -{ - struct vionet_dev *vionet = &dev->vionet; - struct vring_avail *avail; - struct virtio_vq_info *vq_info; - char *vr; - - vq_info = &vionet->vq[RXQ]; - vr = vq_info->q_hva; - if (vr == NULL) - fatalx("%s: vr == NULL", __func__); - - /* Compute offset into avail ring */ - avail = (struct vring_avail *)(vr + vq_info->vq_availoffset); - vq_info->notified_avail = avail->idx - 1; + if (vionet_rx(&dev->vionet, fd) > 0) + virtio_assert_pic_irq(dev, 0); } -int +static int vionet_notifyq(struct virtio_dev *dev) { struct vionet_dev *vionet = &dev->vionet; - int ret = 0; switch (vionet->cfg.queue_notify) { - case RXQ: - vionet_notify_rx(dev); - break; - case TXQ: - ret = vionet_notify_tx(dev); - break; + case TXQ: return vionet_notify_tx(dev); default: /* * Catch the unimplemented queue ID 2 (control queue) as @@ -497,23 +633,25 @@ vionet_notifyq(struct virtio_dev *dev) break; } - return (ret); + return (0); } -int +static int vionet_notify_tx(struct virtio_dev *dev) { - uint16_t idx, pkt_desc_idx, hdr_desc_idx, dxx, cnt; - size_t pktsz, chunk_size = 0; - ssize_t dhcpsz = 0; - int ofs, spc = 0; - char *vr = NULL, *pkt = NULL, *dhcppkt = NULL; + uint16_t idx, hdr_idx; + size_t chain_len, iov_cnt; + ssize_t dhcpsz = 0, sz; + int notify = 0; + char *vr = NULL, *dhcppkt = NULL; struct vionet_dev *vionet = &dev->vionet; - struct vring_desc *desc, *pkt_desc, *hdr_desc; + struct vring_desc *desc, *table; struct vring_avail *avail; struct vring_used *used; struct virtio_vq_info *vq_info; struct ether_header *eh; + struct iovec *iov; + struct packet pkt; if (!(vionet->cfg.device_status & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK)) { @@ -522,155 +660,161 @@ vionet_notify_tx(struct virtio_dev *dev) } vq_info = &vionet->vq[TXQ]; + idx = vq_info->last_avail; vr = vq_info->q_hva; if (vr == NULL) fatalx("%s: vr == NULL", __func__); /* Compute offsets in ring of descriptors, avail ring, and used ring */ - desc = (struct vring_desc *)(vr); + table = (struct vring_desc *)(vr); avail = (struct vring_avail *)(vr + vq_info->vq_availoffset); used = (struct vring_used *)(vr + vq_info->vq_usedoffset); - idx = vq_info->last_avail & VIONET_QUEUE_MASK; - - if ((avail->idx & VIONET_QUEUE_MASK) == idx) - return (0); - - while ((avail->idx & VIONET_QUEUE_MASK) != idx) { - hdr_desc_idx = avail->ring[idx] & VIONET_QUEUE_MASK; - hdr_desc = &desc[hdr_desc_idx]; - pktsz = 0; - - cnt = 0; - dxx = hdr_desc_idx; - do { - pktsz += desc[dxx].len; - dxx = desc[dxx].next & VIONET_QUEUE_MASK; - - /* - * Virtio 1.0, cs04, section 2.4.5: - * "The number of descriptors in the table is defined - * by the queue size for this virtqueue: this is the - * maximum possible descriptor chain length." - */ - if (++cnt >= VIONET_QUEUE_SIZE) { - log_warnx("%s: descriptor table invalid", - __func__); - goto out; - } - } while (desc[dxx].flags & VRING_DESC_F_NEXT); - - pktsz += desc[dxx].len; + while (idx != avail->idx) { + hdr_idx = avail->ring[idx & VIONET_QUEUE_MASK]; + desc = &table[hdr_idx & VIONET_QUEUE_MASK]; + if (DESC_WRITABLE(desc)) { + log_warnx("%s: invalid descriptor state", __func__); + goto reset; + } - /* Remove virtio header descriptor len */ - pktsz -= hdr_desc->len; + iov = &iov_tx[0]; + iov_cnt = 0; + chain_len = 0; - /* Drop packets violating device MTU-based limits */ - if (pktsz < VIONET_MIN_TXLEN || pktsz > VIONET_MAX_TXLEN) { - log_warnx("%s: invalid packet size %lu", __func__, - pktsz); - goto drop_packet; - } - pkt = malloc(pktsz); - if (pkt == NULL) { - log_warn("malloc error alloc packet buf"); - goto out; + /* + * As a legacy device, we most likely will receive a lead + * descriptor sized to the virtio_net_hdr. However, the framing + * is not guaranteed, so check for packet data. + */ + iov->iov_len = desc->len; + if (iov->iov_len < sizeof(struct virtio_net_hdr)) { + log_warnx("%s: invalid descriptor length", __func__); + goto reset; + } else if (iov->iov_len > sizeof(struct virtio_net_hdr)) { + /* Chop off the virtio header, leaving packet data. */ + iov->iov_len -= sizeof(struct virtio_net_hdr); + chain_len += iov->iov_len; + iov->iov_base = hvaddr_mem(desc->addr + + sizeof(struct virtio_net_hdr), iov->iov_len); + if (iov->iov_base == NULL) + goto reset; + iov_cnt++; } - ofs = 0; - pkt_desc_idx = hdr_desc->next & VIONET_QUEUE_MASK; - pkt_desc = &desc[pkt_desc_idx]; - - while (pkt_desc->flags & VRING_DESC_F_NEXT) { - /* must be not writable */ - if (pkt_desc->flags & VRING_DESC_F_WRITE) { - log_warnx("unexpected writable tx desc " - "%d", pkt_desc_idx); - goto out; + /* + * Walk the chain and collect remaining addresses and lengths. + */ + while (desc->flags & VRING_DESC_F_NEXT) { + desc = &table[desc->next & VIONET_QUEUE_MASK]; + if (DESC_WRITABLE(desc)) { + log_warnx("%s: invalid descriptor state", + __func__); + goto reset; } - /* Check we don't read beyond allocated pktsz */ - if (pkt_desc->len > pktsz - ofs) { - log_warnx("%s: descriptor len past pkt len", + /* Collect our IO information, translating gpa's. */ + iov = &iov_tx[iov_cnt]; + iov->iov_len = desc->len; + iov->iov_base = hvaddr_mem(desc->addr, iov->iov_len); + if (iov->iov_base == NULL) + goto reset; + chain_len += iov->iov_len; + + /* Guard against infinitely looping chains. */ + if (++iov_cnt >= nitems(iov_tx)) { + log_warnx("%s: infinite chain detected", __func__); - chunk_size = pktsz - ofs; - } else - chunk_size = pkt_desc->len; - - /* Read packet from descriptor ring */ - if (read_mem(pkt_desc->addr, pkt + ofs, chunk_size)) { - log_warnx("vionet: packet read_mem error " - "@ 0x%llx", pkt_desc->addr); - goto out; + goto reset; } - - ofs += pkt_desc->len; - pkt_desc_idx = pkt_desc->next & VIONET_QUEUE_MASK; - pkt_desc = &desc[pkt_desc_idx]; } - /* Now handle tail descriptor - must be not writable */ - if (pkt_desc->flags & VRING_DESC_F_WRITE) { - log_warnx("unexpected writable tx descriptor %d", - pkt_desc_idx); - goto out; + /* Check if we've got a minimum viable amount of data. */ + if (chain_len < VIONET_MIN_TXLEN) { + sz = chain_len; + goto drop; } - /* Check we don't read beyond allocated pktsz */ - if (pkt_desc->len > pktsz - ofs) { - log_warnx("%s: descriptor len past pkt len", __func__); - chunk_size = pktsz - ofs - pkt_desc->len; - } else - chunk_size = pkt_desc->len; - - /* Read packet from descriptor ring */ - if (read_mem(pkt_desc->addr, pkt + ofs, chunk_size)) { - log_warnx("vionet: packet read_mem error @ " - "0x%llx", pkt_desc->addr); - goto out; + /* + * Packet inspection for ethernet header (if using a "local" + * interface) for possibility of a DHCP packet or (if using + * locked lladdr) for validating ethernet header. + * + * To help preserve zero-copy semantics, we require the first + * descriptor with packet data contains a large enough buffer + * for this inspection. + */ + iov = &iov_tx[0]; + if (vionet->lockedmac) { + if (iov->iov_len < ETHER_HDR_LEN) { + log_warnx("%s: insufficient header data", + __func__); + goto drop; + } + eh = (struct ether_header *)iov->iov_base; + if (memcmp(eh->ether_shost, vionet->mac, + sizeof(eh->ether_shost)) != 0) { + log_warnx("%s: bad source address %s", + __func__, ether_ntoa((struct ether_addr *) + eh->ether_shost)); + sz = chain_len; + goto drop; + } } - - /* reject other source addresses */ - if (vionet->lockedmac && pktsz >= ETHER_HDR_LEN && - (eh = (struct ether_header *)pkt) && - memcmp(eh->ether_shost, vionet->mac, - sizeof(eh->ether_shost)) != 0) - log_debug("vionet: wrong source address %s for vm %d", - ether_ntoa((struct ether_addr *) - eh->ether_shost), dev->vm_id); - else if (vionet->local && - (dhcpsz = dhcp_request(dev, pkt, pktsz, &dhcppkt)) != -1) { - log_debug("vionet: dhcp request," - " local response size %zd", dhcpsz); - - /* XXX signed vs unsigned here, funky cast */ - } else if (write(vionet->data_fd, pkt, pktsz) != (int)pktsz) { - log_warnx("vionet: tx failed writing to tap: " - "%d", errno); - goto out; + if (vionet->local) { + dhcpsz = dhcp_request(dev, iov->iov_base, iov->iov_len, + &dhcppkt); + log_debug("%s: detected dhcp request of %zu bytes", + __func__, dhcpsz); } - drop_packet: - vionet->cfg.isr_status = 1; - used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_desc_idx; - used->ring[used->idx & VIONET_QUEUE_MASK].len = hdr_desc->len; + /* Write our packet to the tap(4). */ + sz = writev(vionet->data_fd, iov_tx, iov_cnt); + if (sz == -1 && errno != ENOBUFS) { + log_warn("%s", __func__); + goto reset; + } + sz += sizeof(struct virtio_net_hdr); +drop: + used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_idx; + used->ring[used->idx & VIONET_QUEUE_MASK].len = sz; __sync_synchronize(); used->idx++; - - vq_info->last_avail = avail->idx & VIONET_QUEUE_MASK; - idx = (idx + 1) & VIONET_QUEUE_MASK; - - free(pkt); - pkt = NULL; + idx++; + + /* Facilitate DHCP reply injection, if needed. */ + if (dhcpsz > 0) { + pkt.buf = dhcppkt; + pkt.len = dhcpsz; + sz = write(pipe_inject[WRITE], &pkt, sizeof(pkt)); + if (sz == -1 && errno != EAGAIN) { + log_warn("%s: packet injection", __func__); + free(pkt.buf); + } else if (sz == -1 && errno == EAGAIN) { + log_debug("%s: dropping dhcp reply", __func__); + free(pkt.buf); + } else if (sz != sizeof(pkt)) { + log_warnx("%s: failed packet injection", + __func__); + free(pkt.buf); + } + log_debug("%s: injected dhcp reply with %ld bytes", + __func__, sz); + } } - if (dhcpsz > 0) - vionet_enq_rx(vionet, dhcppkt, dhcpsz, &spc); - -out: - free(pkt); - free(dhcppkt); + if (idx != vq_info->last_avail && + !(avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { + notify = 1; + vionet->cfg.isr_status |= 1; + } + vq_info->last_avail = idx; + return (notify); +reset: + log_warnx("%s: requesting device reset", __func__); + vionet->cfg.device_status |= DEVICE_NEEDS_RESET; + vionet->cfg.isr_status |= VIRTIO_CONFIG_ISR_CONFIG_CHANGE; return (1); } @@ -728,12 +872,15 @@ dev_dispatch_vm(int fd, short event, void *arg) case IMSG_VMDOP_PAUSE_VM: log_debug("%s: pausing", __func__); event_del(&ev_tap); + event_del(&ev_inject); break; case IMSG_VMDOP_UNPAUSE_VM: log_debug("%s: unpausing", __func__); if (vionet->cfg.device_status - & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) + & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) { event_add(&ev_tap, NULL); + event_add(&ev_inject, NULL); + } break; case IMSG_CTL_VERBOSE: IMSG_SIZE_CHECK(&imsg, &verbose); @@ -821,6 +968,7 @@ handle_sync_io(int fd, short event, void *arg) case VIODEV_MSG_SHUTDOWN: event_del(&dev->sync_iev.ev); event_del(&ev_tap); + event_del(&ev_inject); event_loopbreak(); return; default: @@ -881,11 +1029,11 @@ handle_io_write(struct viodev_msg *msg, struct virtio_dev *dev) virtio_deassert_pic_irq(dev, msg->vcpu); } event_del(&ev_tap); + event_del(&ev_inject); if (vionet->cfg.device_status & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) { - if (event_add(&ev_tap, NULL)) - log_warn("%s: could not initialize virtio tap " - "event handler", __func__); + event_add(&ev_tap, NULL); + event_add(&ev_inject, NULL); } break; default: |