summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- sys/conf/GENERIC 3
-rw-r--r-- sys/kern/uipc_socket.c 361
-rw-r--r-- sys/netinet/in_proto.c 4
-rw-r--r-- sys/netinet/tcp_input.c 36
-rw-r--r-- sys/netinet/tcp_output.c 8
-rw-r--r-- sys/netinet/tcp_var.h 4
-rw-r--r-- sys/netinet6/in6_proto.c 4
-rw-r--r-- sys/sys/protosw.h 3
-rw-r--r-- sys/sys/socket.h 11
-rw-r--r-- sys/sys/socketvar.h 28
10 files changed, 434 insertions, 28 deletions
diff --git a/sys/conf/GENERIC b/sys/conf/GENERIC
index d41397419f4..2868aa33aaa 100644
--- a/sys/conf/GENERIC
+++ b/sys/conf/GENERIC
@@ -1,4 +1,4 @@
-# $OpenBSD: GENERIC,v 1.168 2010/10/18 15:01:21 claudio Exp $
+# $OpenBSD: GENERIC,v 1.169 2011/01/07 17:50:42 bluhm Exp $
#
# Machine-independent option; used by all architectures for their
# GENERIC kernel
@@ -42,6 +42,7 @@ option EXT2FS # Second Extended Filesystem
option MFS # memory file system
option NNPFS # NNPFS filesystem
+option SOCKET_SPLICE # Socket Splicing for TCP
option TCP_SACK # Selective Acknowledgements for TCP
option TCP_ECN # Explicit Congestion Notification for TCP
option TCP_SIGNATURE # TCP MD5 Signatures, for BGP routing sessions
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index 53648922f37..115bb69a8d1 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: uipc_socket.c,v 1.84 2010/09/24 02:59:45 claudio Exp $ */
+/* $OpenBSD: uipc_socket.c,v 1.85 2011/01/07 17:50:42 bluhm Exp $ */
/* $NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $ */
/*
@@ -36,6 +36,7 @@
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
+#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
@@ -50,6 +51,8 @@
#include <net/route.h>
#include <sys/pool.h>
+int sosplice(struct socket *, int, off_t);
+int somove(struct socket *, int);
void filt_sordetach(struct knote *kn);
int filt_soread(struct knote *kn, long hint);
void filt_sowdetach(struct knote *kn);
@@ -144,8 +147,13 @@ sobind(struct socket *so, struct mbuf *nam, struct proc *p)
int
solisten(struct socket *so, int backlog)
{
- int s = splsoftnet(), error;
+ int s, error;
+#ifdef SOCKET_SPLICE
+ if (so->so_splice || so->so_spliceback)
+ return (EOPNOTSUPP);
+#endif /* SOCKET_SPLICE */
+ s = splsoftnet();
error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL,
curproc);
if (error) {
@@ -183,6 +191,21 @@ sofree(struct socket *so)
if (!soqremque(so, 0))
return;
}
+#ifdef SOCKET_SPLICE
+ if (so->so_spliceback) {
+ so->so_snd.sb_flags &= ~SB_SPLICE;
+ so->so_spliceback->so_rcv.sb_flags &= ~SB_SPLICE;
+ so->so_spliceback->so_splice = NULL;
+ if (soreadable(so->so_spliceback))
+ sorwakeup(so->so_spliceback);
+ }
+ if (so->so_splice) {
+ so->so_splice->so_snd.sb_flags &= ~SB_SPLICE;
+ so->so_rcv.sb_flags &= ~SB_SPLICE;
+ so->so_splice->so_spliceback = NULL;
+ }
+ so->so_spliceback = so->so_splice = NULL;
+#endif /* SOCKET_SPLICE */
sbrelease(&so->so_snd);
sorflush(so);
pool_put(&socket_pool, so);
@@ -967,6 +990,311 @@ sorflush(struct socket *so)
sbrelease(&asb);
}
+#ifdef SOCKET_SPLICE
+int
+sosplice(struct socket *so, int fd, off_t max)
+{
+ struct file *fp;
+ struct socket *sosp;
+ int s, error = 0;
+ /*
+ * Splice source socket "so" to the drain socket referenced by file
+ * descriptor "fd", or remove an existing splice when fd is negative.
+ * "max" limits the number of bytes to splice; 0 means no limit.
+ * Returns 0 on success, otherwise an errno value: EPROTONOSUPPORT,
+ * EOPNOTSUPP, ENOTCONN, EINVAL, EBUSY, or whatever getsock() or
+ * sblock() report.
+ *
+ * The source must use a protocol flagged PR_SPLICE, must not be a
+ * listening socket and must be connected or connecting.
+ */
+ if ((so->so_proto->pr_flags & PR_SPLICE) == 0)
+ return (EPROTONOSUPPORT);
+ if (so->so_options & SO_ACCEPTCONN)
+ return (EOPNOTSUPP);
+ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0)
+ return (ENOTCONN);
+
+ /*
+ * If no fd is given, unsplice by removing existing link.
+ * Both buffers lose SB_SPLICE, both link pointers are cleared and
+ * a reader is woken if the source has become readable. Unsplicing
+ * a socket that is not spliced is not an error.
+ */
+ if (fd < 0) {
+ s = splsoftnet();
+ if (so->so_splice) {
+ so->so_splice->so_snd.sb_flags &= ~SB_SPLICE;
+ so->so_rcv.sb_flags &= ~SB_SPLICE;
+ so->so_splice->so_spliceback = NULL;
+ so->so_splice = NULL;
+ if (soreadable(so))
+ sorwakeup(so);
+ }
+ splx(s);
+ return (0);
+ }
+
+ if (max && max < 0) /* a negative limit is invalid ("max &&" is redundant) */
+ return (EINVAL);
+
+ /* Find sosp, the drain socket where data will be spliced into. */
+ if ((error = getsock(curproc->p_fd, fd, &fp)) != 0)
+ return (error);
+ sosp = fp->f_data;
+
+ /*
+ * Lock both receive and send buffer.
+ * The source lock honours SS_NBIO on "so"; the drain lock always
+ * uses M_WAITOK. Every failure path drops what was taken so far,
+ * including the file reference obtained from getsock().
+ */
+ if ((error = sblock(&so->so_rcv,
+ (so->so_state & SS_NBIO) ? M_NOWAIT : M_WAITOK)) != 0) {
+ FRELE(fp);
+ return (error);
+ }
+ if ((error = sblock(&sosp->so_snd, M_WAITOK)) != 0) {
+ sbunlock(&so->so_rcv);
+ FRELE(fp);
+ return (error);
+ }
+ s = splsoftnet();
+
+ /* A socket can be source of one splice and drain of one splice only. */
+ if (so->so_splice || sosp->so_spliceback) {
+ error = EBUSY;
+ goto release;
+ }
+ /* Both ends must be served by the same protocol request function. */
+ if (sosp->so_proto->pr_usrreq != so->so_proto->pr_usrreq) {
+ error = EPROTONOSUPPORT;
+ goto release;
+ }
+ if (sosp->so_options & SO_ACCEPTCONN) {
+ error = EOPNOTSUPP;
+ goto release;
+ }
+ if ((sosp->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
+ error = ENOTCONN;
+ goto release;
+ }
+
+ /* Splice so and sosp together. */
+ so->so_splice = sosp;
+ sosp->so_spliceback = so;
+ so->so_splicelen = 0;
+ so->so_splicemax = max;
+
+ /*
+ * To prevent softnet interrupt from calling somove() while
+ * we sleep, the socket buffers are not marked as spliced yet.
+ * somove() returning 0 means it has already dissolved the link
+ * again (finished, or error stored in so_error), so the buffers
+ * are only flagged when it asks to continue. The return value of
+ * sosplice() is 0 in both cases.
+ */
+ if (somove(so, M_WAIT)) {
+ so->so_rcv.sb_flags |= SB_SPLICE;
+ sosp->so_snd.sb_flags |= SB_SPLICE;
+ }
+
+ release: /* undo in reverse order of acquisition */
+ splx(s);
+ sbunlock(&sosp->so_snd);
+ sbunlock(&so->so_rcv);
+ FRELE(fp);
+ return (error);
+}
+
+/*
+ * Move data from receive buffer of spliced source socket to send
+ * buffer of drain socket. Try to move as much as possible in one
+ * big chunk. It is a TCP only implementation.
+ * Return value 0 means splicing has been finished, 1 continue.
+ *
+ * "so" is the splice source; the drain is taken from so->so_splice.
+ * "wait" is handed to the mbuf routines m_copym(), m_get() and
+ * m_split(): sosplice() passes M_WAIT, the wakeup paths M_DONTWAIT.
+ * Must be called at IPL_SOFTNET, see splsoftassert() below.
+ *
+ * Any error is stored in so->so_error. When splicing is finished
+ * (source cannot receive more and is empty, drain cannot send more,
+ * maximum reached, or error) the SB_SPLICE flags and both link
+ * pointers are cleared and a reader is woken if the source is
+ * readable.
+ */
+int
+somove(struct socket *so, int wait)
+{
+ struct socket *sosp = so->so_splice;
+ struct mbuf *m = NULL, **mp;
+ u_long len, off, oobmark;
+ long space;
+ int error = 0, maxreached = 0;
+ short state;
+
+ splsoftassert(IPL_SOFTNET);
+
+ if (so->so_error) {
+ error = so->so_error;
+ goto release;
+ }
+ if (sosp->so_state & SS_CANTSENDMORE) {
+ error = EPIPE;
+ goto release;
+ }
+ if (sosp->so_error) {
+ error = sosp->so_error;
+ goto release;
+ }
+ /* Drain is still connecting: nothing to do yet, keep the splice. */
+ if ((sosp->so_state & SS_ISCONNECTED) == 0)
+ goto release;
+
+ /* Calculate how many bytes can be copied now. */
+ len = so->so_rcv.sb_cc;
+ if (len == 0)
+ goto release;
+ if (so->so_splicemax) {
+ KASSERT(so->so_splicelen < so->so_splicemax);
+ if (so->so_splicemax <= so->so_splicelen + len) {
+ len = so->so_splicemax - so->so_splicelen;
+ maxreached = 1;
+ }
+ }
+ space = sbspace(&sosp->so_snd);
+ /* Grant 1024 bytes of extra room when an oob mark lies within reach. */
+ if (so->so_oobmark && so->so_oobmark < len &&
+ so->so_oobmark < space + 1024)
+ space += 1024;
+ if (space <= 0) {
+ maxreached = 0;
+ goto release;
+ }
+ if (space < len) {
+ /* Not everything fits, so the maximum is not reached this time. */
+ maxreached = 0;
+ if (space < sosp->so_snd.sb_lowat)
+ goto release;
+ len = space;
+ }
+ sosp->so_state |= SS_ISSENDING;
+
+ /* Take at most len mbufs out of receive buffer. */
+ m = so->so_rcv.sb_mb;
+ for (off = 0, mp = &m; off < len;
+ off += (*mp)->m_len, mp = &(*mp)->m_next) {
+ u_long size = len - off;
+
+ if ((*mp)->m_len > size) {
+ /*
+ * The mbuf is larger than what is left to move. Only
+ * copy a part of it when the splice maximum demands
+ * it; otherwise, or if the copy fails, stop before
+ * this mbuf and shorten len accordingly.
+ * NOTE(review): after a failed m_copym() maxreached is
+ * still set, so the splice is dissolved below although
+ * fewer than so_splicemax bytes were moved -- confirm
+ * this is intended.
+ */
+ if (!maxreached || (*mp = m_copym(
+ so->so_rcv.sb_mb, 0, size, wait)) == NULL) {
+ len -= size;
+ break;
+ }
+ so->so_rcv.sb_mb->m_data += size;
+ so->so_rcv.sb_mb->m_len -= size;
+ so->so_rcv.sb_cc -= size;
+ so->so_rcv.sb_datacc -= size;
+ } else {
+ *mp = so->so_rcv.sb_mb;
+ sbfree(&so->so_rcv, *mp);
+ so->so_rcv.sb_mb = (*mp)->m_next;
+ }
+ }
+ /* Terminate the extracted chain; m becomes NULL if nothing was taken. */
+ *mp = NULL;
+ SB_EMPTY_FIXUP(&so->so_rcv);
+ so->so_rcv.sb_lastrecord = so->so_rcv.sb_mb;
+
+ SBLASTRECORDCHK(&so->so_rcv, "somove");
+ SBLASTMBUFCHK(&so->so_rcv, "somove");
+ /*
+ * m is NULL here when the loop above stopped at the very first
+ * mbuf (mp still pointed at m), so it must not be dereferenced
+ * unconditionally.
+ */
+ KDASSERT(m == NULL || m->m_nextpkt == NULL);
+ KASSERT(so->so_rcv.sb_mb == so->so_rcv.sb_lastrecord);
+#ifdef SOCKBUF_DEBUG
+ sbcheck(&so->so_rcv);
+#endif
+
+ /* Send window update to source peer if receive buffer has changed. */
+ if (m)
+ (so->so_proto->pr_usrreq)(so, PRU_RCVD, NULL,
+ (struct mbuf *)0L, NULL, NULL);
+
+ /* Receive buffer did shrink by len bytes, adjust oob. */
+ state = so->so_state;
+ so->so_state &= ~SS_RCVATMARK;
+ oobmark = so->so_oobmark;
+ so->so_oobmark = oobmark > len ? oobmark - len : 0;
+ if (oobmark) {
+ if (oobmark == len)
+ so->so_state |= SS_RCVATMARK;
+ /* The mark lies beyond the moved data, nothing to split below. */
+ if (oobmark >= len)
+ oobmark = 0;
+ }
+
+ /*
+ * Handle oob data. If any malloc fails, ignore error.
+ * TCP urgent data is not very reliable anyway.
+ * "state" is the source state saved before the adjustment above;
+ * "oobmark" is non-zero only if the mark lies inside the moved data.
+ */
+ while (m && ((state & SS_RCVATMARK) || oobmark) &&
+ (so->so_options & SO_OOBINLINE)) {
+ struct mbuf *o = NULL;
+
+ if (state & SS_RCVATMARK) {
+ /* The first byte of m is the oob byte. */
+ o = m_get(wait, MT_DATA);
+ state &= ~SS_RCVATMARK;
+ } else if (oobmark) {
+ /* Send the data in front of the mark as normal data. */
+ o = m_split(m, oobmark, wait);
+ if (o) {
+ error = (*sosp->so_proto->pr_usrreq)(sosp,
+ PRU_SEND, m, NULL, NULL, NULL);
+ m = NULL;
+ if (error) {
+ m_freem(o);
+ if (sosp->so_state & SS_CANTSENDMORE)
+ error = EPIPE;
+ goto release;
+ }
+ len -= oobmark;
+ so->so_splicelen += oobmark;
+ m = o;
+ o = m_get(wait, MT_DATA);
+ }
+ oobmark = 0;
+ }
+ if (o) {
+ /* Copy the oob byte into its own mbuf and send it as oob. */
+ o->m_len = 1;
+ *mtod(o, caddr_t) = *mtod(m, caddr_t);
+ error = (*sosp->so_proto->pr_usrreq)(sosp, PRU_SENDOOB,
+ o, NULL, NULL, NULL);
+ if (error) {
+ if (sosp->so_state & SS_CANTSENDMORE)
+ error = EPIPE;
+ goto release;
+ }
+ len -= 1;
+ so->so_splicelen += 1;
+ if (oobmark) {
+ oobmark -= 1;
+ if (oobmark == 0)
+ state |= SS_RCVATMARK;
+ }
+ m_adj(m, 1);
+ }
+ }
+
+ /* Append all remaining data to drain socket. */
+ if (m) {
+ /* Source is drained, so this is the last send for now. */
+ if (so->so_rcv.sb_cc == 0)
+ sosp->so_state &= ~SS_ISSENDING;
+ error = (*sosp->so_proto->pr_usrreq)(sosp, PRU_SEND, m, NULL,
+ NULL, NULL);
+ m = NULL;
+ if (error) {
+ if (sosp->so_state & SS_CANTSENDMORE)
+ error = EPIPE;
+ goto release;
+ }
+ so->so_splicelen += len;
+ }
+
+ release:
+ /* Free whatever was extracted but could not be handed to the drain. */
+ if (m)
+ m_freem(m);
+ sosp->so_state &= ~SS_ISSENDING;
+ if (error)
+ so->so_error = error;
+ if (((so->so_state & SS_CANTRCVMORE) && so->so_rcv.sb_cc == 0) ||
+ (sosp->so_state & SS_CANTSENDMORE) || maxreached || error) {
+ /* Splicing is finished: dissolve the link and notify a reader. */
+ sosp->so_snd.sb_flags &= ~SB_SPLICE;
+ so->so_rcv.sb_flags &= ~SB_SPLICE;
+ so->so_splice = sosp->so_spliceback = NULL;
+ if (soreadable(so))
+ sorwakeup(so);
+ return (0);
+ }
+ return (1);
+}
+/*
+ * Receive side wakeup. If the receive buffer is marked SB_SPLICE the
+ * new data is handed to somove() with M_DONTWAIT and the regular
+ * wakeup is skipped; somove() itself calls back here once it has
+ * cleared SB_SPLICE and the socket is readable. Otherwise do the
+ * regular _sorwakeup().
+ */
+void
+sorwakeup(struct socket *so)
+{
+ if (so->so_rcv.sb_flags & SB_SPLICE) {
+ (void) somove(so, M_DONTWAIT); /* result ignored; errors end up in so_error */
+ return;
+ }
+ _sorwakeup(so);
+}
+/*
+ * Send side wakeup. If the send buffer is marked SB_SPLICE this socket
+ * is the drain of a splice, so first let somove() push more data from
+ * the source socket (so_spliceback) with M_DONTWAIT. The regular
+ * _sowwakeup() is done in either case.
+ */
+void
+sowwakeup(struct socket *so)
+{
+ if (so->so_snd.sb_flags & SB_SPLICE)
+ (void) somove(so->so_spliceback, M_DONTWAIT);
+ _sowwakeup(so);
+}
+#endif /* SOCKET_SPLICE */
+
int
sosetopt(struct socket *so, int level, int optname, struct mbuf *m0)
{
@@ -1096,6 +1424,23 @@ sosetopt(struct socket *so, int level, int optname, struct mbuf *m0)
break;
}
+#ifdef SOCKET_SPLICE
+ case SO_SPLICE:
+ if (m == NULL) {
+ error = sosplice(so, -1, 0);
+ } else if (m->m_len < sizeof(int)) {
+ error = EINVAL;
+ goto bad;
+ } else if (m->m_len < sizeof(struct splice)) {
+ error = sosplice(so, *mtod(m, int *), 0);
+ } else {
+ error = sosplice(so,
+ mtod(m, struct splice *)->sp_fd,
+ mtod(m, struct splice *)->sp_max);
+ }
+ break;
+#endif /* SOCKET_SPLICE */
+
default:
error = ENOPROTOOPT;
break;
@@ -1188,6 +1533,18 @@ sogetopt(struct socket *so, int level, int optname, struct mbuf **mp)
break;
}
+#ifdef SOCKET_SPLICE
+ case SO_SPLICE:
+ {
+ int s = splsoftnet();
+
+ m->m_len = sizeof(off_t);
+ *mtod(m, off_t *) = so->so_splicelen;
+ splx(s);
+ break;
+ }
+#endif /* SOCKET_SPLICE */
+
case SO_PEERCRED:
if (so->so_proto->pr_protocol == AF_UNIX) {
struct unpcb *unp = sotounpcb(so);
diff --git a/sys/netinet/in_proto.c b/sys/netinet/in_proto.c
index 7dd03ac2e51..ee8fff3e74b 100644
--- a/sys/netinet/in_proto.c
+++ b/sys/netinet/in_proto.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: in_proto.c,v 1.54 2010/08/29 09:24:38 gollo Exp $ */
+/* $OpenBSD: in_proto.c,v 1.55 2011/01/07 17:50:42 bluhm Exp $ */
/* $NetBSD: in_proto.c,v 1.14 1996/02/18 18:58:32 christos Exp $ */
/*
@@ -189,7 +189,7 @@ struct protosw inetsw[] = {
udp_usrreq,
udp_init, 0, 0, 0, udp_sysctl
},
-{ SOCK_STREAM, &inetdomain, IPPROTO_TCP, PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS,
+{ SOCK_STREAM, &inetdomain, IPPROTO_TCP, PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE,
tcp_input, 0, tcp_ctlinput, tcp_ctloutput,
tcp_usrreq,
tcp_init, 0, tcp_slowtimo, 0, tcp_sysctl
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 66ad114f222..9a6f406456f 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_input.c,v 1.239 2010/09/29 19:42:11 claudio Exp $ */
+/* $OpenBSD: tcp_input.c,v 1.240 2011/01/07 17:50:42 bluhm Exp $ */
/* $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $ */
/*
@@ -328,7 +328,9 @@ present:
pool_put(&tcpqe_pool, q);
q = nq;
} while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
+ tp->t_flags |= TF_BLOCKOUTPUT;
sorwakeup(so);
+ tp->t_flags &= ~TF_BLOCKOUTPUT;
return (flags);
}
@@ -368,7 +370,7 @@ tcp_input(struct mbuf *m, ...)
struct tcpcb *tp = 0;
int tiflags;
struct socket *so = NULL;
- int todrop, acked, ourfinisacked, needoutput = 0;
+ int todrop, acked, ourfinisacked;
int hdroptlen = 0;
short ostate = 0;
tcp_seq iss, *reuse = NULL;
@@ -1090,9 +1092,13 @@ after_listen:
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
tcp_update_sndspace(tp);
- if (sb_notify(&so->so_snd))
+ if (sb_notify(&so->so_snd)) {
+ tp->t_flags |= TF_BLOCKOUTPUT;
sowwakeup(so);
- if (so->so_snd.sb_cc)
+ tp->t_flags &= ~TF_BLOCKOUTPUT;
+ }
+ if (so->so_snd.sb_cc ||
+ tp->t_flags & TF_NEEDOUTPUT)
(void) tcp_output(tp);
return;
}
@@ -1136,8 +1142,10 @@ after_listen:
m_adj(m, iphlen + off);
sbappendstream(&so->so_rcv, m);
}
+ tp->t_flags |= TF_BLOCKOUTPUT;
sorwakeup(so);
- if (tp->t_flags & TF_ACKNOW)
+ tp->t_flags &= ~TF_BLOCKOUTPUT;
+ if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
(void) tcp_output(tp);
return;
}
@@ -1773,10 +1781,10 @@ trimthenstep6:
#if defined(TCP_SACK) && defined(TCP_FACK)
/* Force call to tcp_output */
if (tp->snd_awnd < tp->snd_cwnd)
- needoutput = 1;
+ tp->t_flags |= TF_NEEDOUTPUT;
#else
tp->snd_cwnd += tp->t_maxseg;
- needoutput = 1;
+ tp->t_flags |= TF_NEEDOUTPUT;
#endif /* TCP_FACK */
} else {
/* Out of fast recovery */
@@ -1844,7 +1852,7 @@ trimthenstep6:
*/
if (th->th_ack == tp->snd_max) {
TCP_TIMER_DISARM(tp, TCPT_REXMT);
- needoutput = 1;
+ tp->t_flags |= TF_NEEDOUTPUT;
} else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
/*
@@ -1877,8 +1885,11 @@ trimthenstep6:
}
tcp_update_sndspace(tp);
- if (sb_notify(&so->so_snd))
+ if (sb_notify(&so->so_snd)) {
+ tp->t_flags |= TF_BLOCKOUTPUT;
sowwakeup(so);
+ tp->t_flags &= ~TF_BLOCKOUTPUT;
+ }
/*
* If we had a pending ICMP message that referred to data
@@ -1996,7 +2007,7 @@ step6:
tp->snd_wl2 = th->th_ack;
if (tp->snd_wnd > tp->max_sndwnd)
tp->max_sndwnd = tp->snd_wnd;
- needoutput = 1;
+ tp->t_flags |= TF_NEEDOUTPUT;
}
/*
@@ -2088,7 +2099,9 @@ dodata: /* XXX */
m_adj(m, hdroptlen);
sbappendstream(&so->so_rcv, m);
}
+ tp->t_flags |= TF_BLOCKOUTPUT;
sorwakeup(so);
+ tp->t_flags &= ~TF_BLOCKOUTPUT;
} else {
m_adj(m, hdroptlen);
tiflags = tcp_reass(tp, th, m, &tlen);
@@ -2182,9 +2195,8 @@ dodata: /* XXX */
/*
* Return any desired output.
*/
- if (needoutput || (tp->t_flags & TF_ACKNOW)) {
+ if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
(void) tcp_output(tp);
- }
return;
badsyn:
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index 08d93835a4a..868b14c65e1 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_output.c,v 1.92 2010/09/24 02:59:45 claudio Exp $ */
+/* $OpenBSD: tcp_output.c,v 1.93 2011/01/07 17:50:42 bluhm Exp $ */
/* $NetBSD: tcp_output.c,v 1.16 1997/06/03 16:17:09 kml Exp $ */
/*
@@ -228,6 +228,12 @@ tcp_output(struct tcpcb *tp)
int needect;
#endif
+ if (tp->t_flags & TF_BLOCKOUTPUT) {
+ tp->t_flags |= TF_NEEDOUTPUT;
+ return (0);
+ } else
+ tp->t_flags &= ~TF_NEEDOUTPUT;
+
#if defined(TCP_SACK) && defined(TCP_SIGNATURE) && defined(DIAGNOSTIC)
if (tp->sack_enable && (tp->t_flags & TF_SIGNATURE))
return (EINVAL);
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 6ded3dd733f..0573c7d966b 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_var.h,v 1.97 2010/10/21 11:38:27 bluhm Exp $ */
+/* $OpenBSD: tcp_var.h,v 1.98 2011/01/07 17:50:42 bluhm Exp $ */
/* $NetBSD: tcp_var.h,v 1.17 1996/02/13 23:44:24 christos Exp $ */
/*
@@ -95,6 +95,8 @@ struct tcpcb {
#define TF_LASTIDLE 0x00100000 /* no outstanding ACK on last send */
#define TF_DEAD 0x00200000 /* dead and to-be-released */
#define TF_PMTUD_PEND 0x00400000 /* Path MTU Discovery pending */
+#define TF_NEEDOUTPUT 0x00800000 /* call tcp_output after tcp_input */
+#define TF_BLOCKOUTPUT 0x01000000 /* avert tcp_output during tcp_input */
struct mbuf *t_template; /* skeletal packet for transmit */
struct inpcb *t_inpcb; /* back pointer to internet pcb */
diff --git a/sys/netinet6/in6_proto.c b/sys/netinet6/in6_proto.c
index 11c29783988..7d40de9db8f 100644
--- a/sys/netinet6/in6_proto.c
+++ b/sys/netinet6/in6_proto.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: in6_proto.c,v 1.59 2010/07/08 19:42:46 jsg Exp $ */
+/* $OpenBSD: in6_proto.c,v 1.60 2011/01/07 17:50:42 bluhm Exp $ */
/* $KAME: in6_proto.c,v 1.66 2000/10/10 15:35:47 itojun Exp $ */
/*
@@ -137,7 +137,7 @@ struct ip6protosw inet6sw[] = {
0, 0, 0,
udp_sysctl,
},
-{ SOCK_STREAM, &inet6domain, IPPROTO_TCP, PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS,
+{ SOCK_STREAM, &inet6domain, IPPROTO_TCP, PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE,
tcp6_input, 0, tcp6_ctlinput, tcp_ctloutput,
tcp_usrreq,
#ifdef INET /* don't call initialization and timeout routines twice */
diff --git a/sys/sys/protosw.h b/sys/sys/protosw.h
index 40a362e79d9..ea47dbc22e6 100644
--- a/sys/sys/protosw.h
+++ b/sys/sys/protosw.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: protosw.h,v 1.14 2010/10/18 04:31:01 guenther Exp $ */
+/* $OpenBSD: protosw.h,v 1.15 2011/01/07 17:50:42 bluhm Exp $ */
/* $NetBSD: protosw.h,v 1.10 1996/04/09 20:55:32 cgd Exp $ */
/*-
@@ -106,6 +106,7 @@ struct protosw {
#define PR_RIGHTS 0x10 /* passes capabilities */
#define PR_ABRTACPTDIS 0x20 /* abort on accept(2) to disconnected
socket */
+#define PR_SPLICE 0x40 /* socket splicing is possible */
/*
* The arguments to usrreq are:
diff --git a/sys/sys/socket.h b/sys/sys/socket.h
index 2c9d895a334..f8454d7d4dd 100644
--- a/sys/sys/socket.h
+++ b/sys/sys/socket.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: socket.h,v 1.70 2010/07/05 22:20:22 tedu Exp $ */
+/* $OpenBSD: socket.h,v 1.71 2011/01/07 17:50:42 bluhm Exp $ */
/* $NetBSD: socket.h,v 1.14 1996/02/09 18:25:36 christos Exp $ */
/*
@@ -87,6 +87,7 @@
#define SO_NETPROC 0x1020 /* multiplex; network processing */
#define SO_RTABLE 0x1021 /* routing table to be used */
#define SO_PEERCRED 0x1022 /* get connect-time credentials */
+#define SO_SPLICE 0x1023 /* splice data to other socket */
/*
* Structure used for manipulating linger option.
@@ -97,6 +98,14 @@ struct linger {
};
/*
+ * Structure used for manipulating splice option.
+ */
+struct splice {
+ int sp_fd; /* drain socket file descriptor; a negative value removes the splice */
+ off_t sp_max; /* if set, maximum bytes to splice; 0 means no limit, negative is rejected */
+};
+
+/*
* Level number for (get/set)sockopt() to apply to socket itself.
*/
#define SOL_SOCKET 0xffff /* options for socket level */
diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h
index 81c6e17267b..fbe2a7a48e8 100644
--- a/sys/sys/socketvar.h
+++ b/sys/sys/socketvar.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: socketvar.h,v 1.47 2010/09/24 02:59:46 claudio Exp $ */
+/* $OpenBSD: socketvar.h,v 1.48 2011/01/07 17:50:42 bluhm Exp $ */
/* $NetBSD: socketvar.h,v 1.18 1996/02/09 18:25:38 christos Exp $ */
/*-
@@ -75,6 +75,13 @@ struct socket {
uid_t so_siguid; /* uid of process who set so_pgid */
uid_t so_sigeuid; /* euid of process who set so_pgid */
u_long so_oobmark; /* chars to oob mark */
+
+#if 1 /*def SOCKET_SPLICE*/
+ struct socket *so_splice; /* send data to drain socket */
+ struct socket *so_spliceback; /* back ref for notify and cleanup */
+ off_t so_splicelen; /* number of bytes spliced so far */
+ off_t so_splicemax; /* maximum number of bytes to splice */
+#endif /* SOCKET_SPLICE */
/*
* Variables for socket buffering.
*/
@@ -102,6 +109,7 @@ struct socket {
#define SB_ASYNC 0x10 /* ASYNC I/O, need signals */
#define SB_NOINTR 0x40 /* operations not interruptible */
#define SB_KNOTE 0x80 /* kernel note attached */
+#define SB_SPLICE 0x0100 /* buffer is splice source or drain */
void *so_internal; /* Space for svr4 stream data */
void (*so_upcall)(struct socket *so, caddr_t arg, int waitf);
@@ -145,7 +153,7 @@ struct socket {
* Do we need to notify the other side when I/O is possible?
*/
#define sb_notify(sb) (((sb)->sb_flags & (SB_WAIT|SB_SEL|SB_ASYNC| \
- SB_KNOTE)) != 0)
+ SB_KNOTE|SB_SPLICE)) != 0)
/*
* How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
@@ -165,7 +173,7 @@ struct socket {
((so)->so_state & SS_ISSENDING)
/* can we read something from so? */
-#define soreadable(so) \
+#define _soreadable(so) \
((so)->so_rcv.sb_cc >= (so)->so_rcv.sb_lowat || \
((so)->so_state & SS_CANTRCVMORE) || \
(so)->so_qlen || (so)->so_error)
@@ -215,14 +223,24 @@ struct socket {
} \
} while (/* CONSTCOND */ 0)
-#define sorwakeup(so) do { \
+#define _sorwakeup(so) do { \
sowakeup((so), &(so)->so_rcv); \
if ((so)->so_upcall) \
(*((so)->so_upcall))((so), (so)->so_upcallarg, \
M_DONTWAIT); \
} while (/* CONSTCOND */ 0)
-#define sowwakeup(so) sowakeup((so), &(so)->so_snd)
+#define _sowwakeup(so) sowakeup((so), &(so)->so_snd)
+/*
+ * With SOCKET_SPLICE a socket that is the source of a splice
+ * (so_splice set) is never reported readable, and sorwakeup() and
+ * sowwakeup() are real functions so they can call somove() first.
+ * Without the option all three are plain aliases for the underscore
+ * variants defined above.
+ */
+#ifdef SOCKET_SPLICE
+#define soreadable(so) ((so)->so_splice == NULL && _soreadable(so))
+void sorwakeup(struct socket *);
+void sowwakeup(struct socket *);
+#else /* SOCKET_SPLICE */
+#define soreadable(so) _soreadable(so)
+#define sorwakeup(so) _sorwakeup(so)
+#define sowwakeup(so) _sowwakeup(so)
+#endif /* SOCKET_SPLICE */
#ifdef _KERNEL
extern u_long sb_max;