diff options
author | David Gwynne <dlg@cvs.openbsd.org> | 2024-11-17 00:25:08 +0000 |
---|---|---|
committer | David Gwynne <dlg@cvs.openbsd.org> | 2024-11-17 00:25:08 +0000 |
commit | ce8a47fdbfec7dfa9d85b883396e1085bfc86fd5 (patch) | |
tree | 1aeba7c580c63658d4c1b5b517431bc7b90ba248 | |
parent | 6ddc2616716ff77eec54b3a9c5cbe497bc6ee4a6 (diff) |
provide network offloads between the kernel and userland again
userland can request that network packets that are read from or
written to the device special file get prepended with a "tun_hdr"
struct. this struct contains bits which say what offloads are
requested for the packet, including things like ip/tcp/udp/icmp
checksums, tcp segmentation offloads, or ethernet vlan tags.
userland can write a packet with any of these offloads requested
into the kernel at any time, but has to request which ones it's
able to handle coming from the kernel. enabling the tun_hdr struct
and which offloads userland can handle is done with a new TUNSCAP
ioctl.
this is based on the virtio_net_hdr in linux, which jan@ actually
implemented and had working with vmd. however, claudio@ and i
were strongly opposed to what feels like a layer violation by pulling
virtio structures into the tun driver, and then trying to emulate
virtio/linux semantics in our network stack, and playing catch up
when the "upstream" projects decide to change the shape or meaning
of these bits. tun_hdr is specific to the openbsd network stack and
its semantics, which simplifies our kernel implementation. jan has
been pretty gracious about the extra work on the vmd side of things.
tested by and ok jan@
ok claudio@
sthen@ backed this out cos of confusion with the ioctl numbers i
picked for controlling this feature. i've picked new numbers that
don't conflict this time.
-rw-r--r-- | sys/net/if_tun.c | 166 | ||||
-rw-r--r-- | sys/net/if_tun.h | 53 |
2 files changed, 213 insertions, 6 deletions
diff --git a/sys/net/if_tun.c b/sys/net/if_tun.c index 432d2b6154e..a1a84104066 100644 --- a/sys/net/if_tun.c +++ b/sys/net/if_tun.c @@ -1,4 +1,4 @@ -/* $OpenBSD: if_tun.c,v 1.246 2024/11/14 13:47:38 sthen Exp $ */ +/* $OpenBSD: if_tun.c,v 1.247 2024/11/17 00:25:07 dlg Exp $ */ /* $NetBSD: if_tun.c,v 1.24 1996/05/07 02:40:48 thorpej Exp $ */ /* @@ -88,6 +88,7 @@ struct tun_softc { struct sigio_ref sc_sigio; /* async I/O registration */ unsigned int sc_flags; /* misc flags */ #define TUN_DEAD (1 << 16) +#define TUN_HDR (1 << 17) dev_t sc_dev; struct refcnt sc_refs; @@ -104,6 +105,13 @@ int tundebug = TUN_DEBUG; /* Pretend that these IFF flags are changeable by TUNSIFINFO */ #define TUN_IFF_FLAGS (IFF_POINTOPOINT|IFF_MULTICAST|IFF_BROADCAST) +#define TUN_IF_CAPS ( \ + IFCAP_CSUM_IPv4 | \ + IFCAP_CSUM_TCPv4|IFCAP_CSUM_UDPv4|IFCAP_CSUM_TCPv6|IFCAP_CSUM_UDPv6 | \ + IFCAP_VLAN_MTU|IFCAP_VLAN_HWTAGGING|IFCAP_VLAN_HWOFFLOAD | \ + IFCAP_TSOv4|IFCAP_TSOv6|IFCAP_LRO \ +) + void tunattach(int); int tun_dev_open(dev_t, const struct if_clone *, int, struct proc *); @@ -496,10 +504,11 @@ tun_dev_close(dev_t dev, struct proc *p) */ NET_LOCK(); CLR(ifp->if_flags, IFF_UP | IFF_RUNNING); + CLR(ifp->if_capabilities, TUN_IF_CAPS); NET_UNLOCK(); ifq_purge(&ifp->if_snd); - CLR(sc->sc_flags, TUN_ASYNC); + CLR(sc->sc_flags, TUN_ASYNC|TUN_HDR); sigio_free(&sc->sc_sigio); if (!ISSET(sc->sc_flags, TUN_DEAD)) { @@ -627,6 +636,51 @@ tapioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct proc *p) return (tun_dev_ioctl(dev, cmd, data)); } +static int +tun_set_capabilities(struct tun_softc *sc, const struct tun_capabilities *cap) +{ + if (ISSET(cap->tun_if_capabilities, ~TUN_IF_CAPS)) + return (EINVAL); + + KERNEL_ASSERT_LOCKED(); + SET(sc->sc_flags, TUN_HDR); + + NET_LOCK(); + CLR(sc->sc_if.if_capabilities, TUN_IF_CAPS); + SET(sc->sc_if.if_capabilities, cap->tun_if_capabilities); + NET_UNLOCK(); + return (0); +} + +static int +tun_get_capabilities(struct tun_softc *sc, struct 
tun_capabilities *cap) +{ + int error = 0; + + NET_LOCK_SHARED(); + if (ISSET(sc->sc_flags, TUN_HDR)) { + cap->tun_if_capabilities = + (sc->sc_if.if_capabilities & TUN_IF_CAPS); + } else + error = ENODEV; + NET_UNLOCK_SHARED(); + + return (error); +} + +static int +tun_del_capabilities(struct tun_softc *sc) +{ + NET_LOCK(); + CLR(sc->sc_if.if_capabilities, TUN_IF_CAPS); + NET_UNLOCK(); + + KERNEL_ASSERT_LOCKED(); + CLR(sc->sc_flags, TUN_HDR); + + return (0); +} + int tun_dev_ioctl(dev_t dev, u_long cmd, void *data) { @@ -678,6 +732,18 @@ tun_dev_ioctl(dev_t dev, u_long cmd, void *data) } break; + case TUNSCAP: + error = tun_set_capabilities(sc, + (const struct tun_capabilities *)data); + break; + case TUNGCAP: + error = tun_get_capabilities(sc, + (struct tun_capabilities *)data); + break; + case TUNDCAP: + error = tun_del_capabilities(sc); + break; + case FIONBIO: break; case FIOASYNC: @@ -745,6 +811,7 @@ tun_dev_read(dev_t dev, struct uio *uio, int ioflag) struct tun_softc *sc; struct ifnet *ifp; struct mbuf *m, *m0; + size_t len; int error = 0; sc = tun_get(dev); @@ -763,9 +830,46 @@ tun_dev_read(dev_t dev, struct uio *uio, int ioflag) bpf_mtap(ifp->if_bpf, m0, BPF_DIRECTION_OUT); #endif + if (ISSET(sc->sc_flags, TUN_HDR)) { + struct tun_hdr th; + + KASSERT(ISSET(m0->m_flags, M_PKTHDR)); + + th.th_flags = 0; + if (ISSET(m0->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT)) + SET(th.th_flags, TUN_H_IPV4_CSUM); + if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_CSUM_OUT)) + SET(th.th_flags, TUN_H_TCP_CSUM); + if (ISSET(m0->m_pkthdr.csum_flags, M_UDP_CSUM_OUT)) + SET(th.th_flags, TUN_H_UDP_CSUM); + if (ISSET(m0->m_pkthdr.csum_flags, M_ICMP_CSUM_OUT)) + SET(th.th_flags, TUN_H_ICMP_CSUM); + + th.th_pad = 0; + + th.th_vtag = 0; + if (ISSET(m0->m_flags, M_VLANTAG)) { + SET(th.th_flags, TUN_H_VTAG); + th.th_vtag = m0->m_pkthdr.ether_vtag; + } + + th.th_mss = 0; + if (ISSET(m0->m_pkthdr.csum_flags, M_TCP_TSO)) { + SET(th.th_flags, TUN_H_TCP_MSS); + th.th_mss = m0->m_pkthdr.ph_mss; + } + + 
len = ulmin(uio->uio_resid, sizeof(th)); + if (len > 0) { + error = uiomove(&th, len, uio); + if (error != 0) + goto free; + } + } + m = m0; while (uio->uio_resid > 0) { - size_t len = ulmin(uio->uio_resid, m->m_len); + len = ulmin(uio->uio_resid, m->m_len); if (len > 0) { error = uiomove(mtod(m, void *), len, uio); if (error != 0) @@ -777,6 +881,7 @@ tun_dev_read(dev_t dev, struct uio *uio, int ioflag) break; } +free: m_freem(m0); put: @@ -807,6 +912,8 @@ tun_dev_write(dev_t dev, struct uio *uio, int ioflag, int align) struct mbuf *m0; int error = 0; size_t mlen; + size_t hlen; + struct tun_hdr th; sc = tun_get(dev); if (sc == NULL) @@ -814,8 +921,11 @@ tun_dev_write(dev_t dev, struct uio *uio, int ioflag, int align) ifp = &sc->sc_if; - if (uio->uio_resid < ifp->if_hdrlen || - uio->uio_resid > (ifp->if_hdrlen + ifp->if_hardmtu)) { + hlen = ifp->if_hdrlen; + if (ISSET(sc->sc_flags, TUN_HDR)) + hlen += sizeof(th); + if (uio->uio_resid < hlen || + uio->uio_resid > (hlen + ifp->if_hardmtu)) { error = EMSGSIZE; goto put; } @@ -840,6 +950,52 @@ tun_dev_write(dev_t dev, struct uio *uio, int ioflag, int align) m0->m_pkthdr.len = m0->m_len = mlen; m_adj(m0, align); + if (ISSET(sc->sc_flags, TUN_HDR)) { + error = uiomove(&th, sizeof(th), uio); + if (error != 0) + goto drop; + + if (ISSET(th.th_flags, TUN_H_IPV4_CSUM)) { + SET(m0->m_pkthdr.csum_flags, + M_IPV4_CSUM_OUT | M_IPV4_CSUM_IN_OK); + } + + switch (th.th_flags & + (TUN_H_TCP_CSUM|TUN_H_UDP_CSUM|TUN_H_ICMP_CSUM)) { + case 0: + break; + case TUN_H_TCP_CSUM: + SET(m0->m_pkthdr.csum_flags, + M_TCP_CSUM_OUT | M_TCP_CSUM_IN_OK); + break; + case TUN_H_UDP_CSUM: + SET(m0->m_pkthdr.csum_flags, + M_UDP_CSUM_OUT | M_UDP_CSUM_IN_OK); + break; + case TUN_H_ICMP_CSUM: + SET(m0->m_pkthdr.csum_flags, + M_ICMP_CSUM_OUT | M_ICMP_CSUM_IN_OK); + break; + default: + error = EINVAL; + goto drop; + } + + if (ISSET(th.th_flags, TUN_H_VTAG)) { + if (!ISSET(sc->sc_flags, TUN_LAYER2)) { + error = EINVAL; + goto drop; + } + SET(m0->m_flags, 
M_VLANTAG); + m0->m_pkthdr.ether_vtag = th.th_vtag; + } + + if (ISSET(th.th_flags, TUN_H_TCP_MSS)) { + SET(m0->m_pkthdr.csum_flags, M_TCP_TSO); + m0->m_pkthdr.ph_mss = th.th_mss; + } + } + error = uiomove(mtod(m0, void *), m0->m_len, uio); if (error != 0) goto drop; diff --git a/sys/net/if_tun.h b/sys/net/if_tun.h index 3e17f0d10b1..fb40345744e 100644 --- a/sys/net/if_tun.h +++ b/sys/net/if_tun.h @@ -1,4 +1,4 @@ -/* $OpenBSD: if_tun.h,v 1.17 2024/11/14 13:47:38 sthen Exp $ */ +/* $OpenBSD: if_tun.h,v 1.18 2024/11/17 00:25:07 dlg Exp $ */ /* * Copyright (c) 1988, Julian Onions <Julian.Onions@nexor.co.uk> @@ -31,6 +31,49 @@ #include <sys/ioccom.h> +/* + * tun_hdr is a multiple of 4 bytes, but is built out of uint16_t + * fields. This allows it to sit on a 2 byte boundary in front of + * either IP (and MPLS) or Ethernet packets for tun(4) and tap(4) + * interfaces respectively while maintaining the alignment of their + * payloads. + * + * Userland can request the use of the tun_hdr using the TUNSCAP + * ioctl. This ioctl also allows userland to specify which "offload" + * capabilities it is able to accept in packets it will read from the + * kernel. It is acceptable to enable tun_hdr without enabling any + * interface offload capabilities. + * + * Once the tap_hdr is enabled, userland can write packets into the + * kernel with any of the supported features. tun(4)/tap(4) reads + * will unconditionally handle any features specified on the packet, + * regardless of what capabilities were specified by the TUNSCAP + * ioctl. + * + * The tun_hdr can be read from one interface and written directly + * to another without interpretation or modification. + * + * Use of tun_hdr and the associated capabilities are reset when a + * tun(4)/tap(4) device is closed. 
+ */ + +struct tun_hdr { + uint16_t th_flags; +#define TUN_H_PRIO_MASK 0x7 +#define TUN_H_VTAG (1 << 4) /* th_vtag is set */ +#define TUN_H_TCP_MSS (1 << 5) /* Cut TCP frame up by th_mss */ + +#define TUN_H_IPV4_CSUM (1 << 8) +#define TUN_H_TCP_CSUM (1 << 9) +#define TUN_H_UDP_CSUM (1 << 10) +#define TUN_H_ICMP_CSUM (1 << 11) + + uint16_t th_pad; + + uint16_t th_vtag; + uint16_t th_mss; +}; + #define TUN_OPEN 0x0001 #define TUN_INITED 0x0002 #define TUN_RCOLL 0x0004 /* unused */ @@ -68,4 +111,12 @@ struct tuninfo { #define TUNSDEBUG _IOW('t', 94, int) #define TUNGDEBUG _IOR('t', 95, int) +struct tun_capabilities { + uint32_t tun_if_capabilities; /* IFCAP_* from net/if.h */ +}; + +#define TUNSCAP _IOW('t', 196, struct tun_capabilities) +#define TUNGCAP _IOR('t', 196, struct tun_capabilities) +#define TUNDCAP _IO('t', 196) + #endif /* _NET_IF_TUN_H_ */ |