author		Vitaliy Makkoveev <mvs@cvs.openbsd.org>	2024-07-20 17:26:20 +0000
committer	Vitaliy Makkoveev <mvs@cvs.openbsd.org>	2024-07-20 17:26:20 +0000
commit		d315ab526c7f9ea549df7384e58c3ce848892634 (patch)
tree		56d284ebdaa2c6f88a06ecbdebbed7bc67a2e8da /sys/kern/uipc_socket.c
parent		8e0aecedc0f5db86c589d5e919b2f8e8263fd593 (diff)
Unlock udp(4) somove().
Socket splicing belongs to the socket buffers. udp(4) sockets are fully switched to fine-grained socket-buffer locks, so use them instead of the exclusive solock(). Always schedule the somove() thread to run, as we already do in the tcp(4) case. This adds a delay to packet processing, but it is comparable with the non-splicing case, where soreceive() threads are always scheduled.

So spliced udp(4) sockets now rely on the sb_lock() of the `so_rcv' buffer together with the `sb_mtx' mutexes of both buffers. The shared solock() is only required around the pru_send() call, so most of the somove() thread runs concurrently with the network stack.

Also document `sosplice' structure locking.

Feedback, tests and OK from bluhm.
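For orientation, the lock ordering this change establishes for a spliced udp(4) socket can be condensed as follows. This is an illustrative sketch assembled from the somove() and sotask() hunks below, not code from the tree; `so' is the source socket and `sosp' the drain socket.

	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR); /* excludes concurrent sounsplice() */
	mtx_enter(&so->so_rcv.sb_mtx);              /* source buffer */
	mtx_enter(&sosp->so_snd.sb_mtx);            /* drain buffer */
	/* ... move mbufs from `so_rcv' to `so_snd' ... */
	mtx_leave(&sosp->so_snd.sb_mtx);
	mtx_leave(&so->so_rcv.sb_mtx);
	solock_shared(sosp);                        /* only pru_send() needs solock() */
	error = pru_send(sosp, m, NULL, NULL);
	sounlock_shared(sosp);
	sbunlock(&so->so_rcv);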
Diffstat (limited to 'sys/kern/uipc_socket.c')
-rw-r--r--	sys/kern/uipc_socket.c	300
1 file changed, 219 insertions(+), 81 deletions(-)
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index a17ac58eaba..52157918bb6 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: uipc_socket.c,v 1.338 2024/07/14 15:42:23 bluhm Exp $ */
+/* $OpenBSD: uipc_socket.c,v 1.339 2024/07/20 17:26:19 mvs Exp $ */
/* $NetBSD: uipc_socket.c,v 1.21 1996/02/04 02:17:52 christos Exp $ */
/*
@@ -324,31 +324,22 @@ sofree(struct socket *so, int keep_lock)
sounlock(head);
}
- if (persocket) {
+ switch (so->so_proto->pr_domain->dom_family) {
+ case AF_INET:
+ case AF_INET6:
+ if (so->so_proto->pr_type == SOCK_STREAM)
+ break;
+ /* FALLTHROUGH */
+ default:
sounlock(so);
refcnt_finalize(&so->so_refcnt, "sofinal");
solock(so);
+ break;
}
sigio_free(&so->so_sigio);
klist_free(&so->so_rcv.sb_klist);
klist_free(&so->so_snd.sb_klist);
-#ifdef SOCKET_SPLICE
- if (issplicedback(so)) {
- int freeing = SOSP_FREEING_WRITE;
-
- if (so->so_sp->ssp_soback == so)
- freeing |= SOSP_FREEING_READ;
- sounsplice(so->so_sp->ssp_soback, so, freeing);
- }
- if (isspliced(so)) {
- int freeing = SOSP_FREEING_READ;
-
- if (so == so->so_sp->ssp_socket)
- freeing |= SOSP_FREEING_WRITE;
- sounsplice(so, so->so_sp->ssp_socket, freeing);
- }
-#endif /* SOCKET_SPLICE */
mtx_enter(&so->so_snd.sb_mtx);
sbrelease(so, &so->so_snd);
@@ -458,6 +449,85 @@ discard:
if (so->so_state & SS_NOFDREF)
panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
so->so_state |= SS_NOFDREF;
+
+#ifdef SOCKET_SPLICE
+ if (so->so_sp) {
+ struct socket *soback;
+
+ if (so->so_proto->pr_flags & PR_WANTRCVD) {
+ /*
+ * Copied from sofree(), but we can't relock and
+ * sleep there in the tcp(4) case. That's why
+ * tcp(4) still relies on solock() for splicing
+ * and unsplicing.
+ */
+
+ if (issplicedback(so)) {
+ int freeing = SOSP_FREEING_WRITE;
+
+ if (so->so_sp->ssp_soback == so)
+ freeing |= SOSP_FREEING_READ;
+ sounsplice(so->so_sp->ssp_soback, so, freeing);
+ }
+ if (isspliced(so)) {
+ int freeing = SOSP_FREEING_READ;
+
+ if (so == so->so_sp->ssp_socket)
+ freeing |= SOSP_FREEING_WRITE;
+ sounsplice(so, so->so_sp->ssp_socket, freeing);
+ }
+ goto free;
+ }
+
+ sounlock(so);
+ mtx_enter(&so->so_snd.sb_mtx);
+ /*
+ * A concurrent sounsplice() locks the `sb_mtx' mutexes on
+ * both `so_snd' and `so_rcv' before unsplicing the sockets.
+ */
+ if ((soback = so->so_sp->ssp_soback) == NULL) {
+ mtx_leave(&so->so_snd.sb_mtx);
+ goto notsplicedback;
+ }
+ soref(soback);
+ mtx_leave(&so->so_snd.sb_mtx);
+
+ /*
+ * `so' can only be unspliced, never spliced again.
+ * Thus if the issplicedback(so) check is positive, the
+ * socket is still spliced and `ssp_soback' points to the
+ * same socket as `soback'.
+ */
+ sblock(&soback->so_rcv, SBL_WAIT | SBL_NOINTR);
+ if (issplicedback(so)) {
+ int freeing = SOSP_FREEING_WRITE;
+
+ if (so->so_sp->ssp_soback == so)
+ freeing |= SOSP_FREEING_READ;
+ solock(soback);
+ sounsplice(so->so_sp->ssp_soback, so, freeing);
+ sounlock(soback);
+ }
+ sbunlock(&soback->so_rcv);
+ sorele(soback);
+
+notsplicedback:
+ sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
+ if (isspliced(so)) {
+ int freeing = SOSP_FREEING_READ;
+
+ if (so == so->so_sp->ssp_socket)
+ freeing |= SOSP_FREEING_WRITE;
+ solock(so);
+ sounsplice(so, so->so_sp->ssp_socket, freeing);
+ sounlock(so);
+ }
+ sbunlock(&so->so_rcv);
+
+ solock(so);
+ }
+free:
+#endif /* SOCKET_SPLICE */
/* sofree() calls sounlock(). */
sofree(so, 0);
return (error);
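The block above follows a lookup-then-revalidate pattern: take a reference to the splice peer under `sb_mtx', sleep on sblock(), then re-check issplicedback() because the splice may have been torn down while soclose() slept. Distilled, with hypothetical names (`obj', `peer' and the helpers are illustrative, not kernel identifiers):

	mtx_enter(&obj->mtx);
	if ((peer = obj->peer) == NULL) {  /* already unspliced */
		mtx_leave(&obj->mtx);
		return;
	}
	refcnt_take(&peer->refs);          /* keep peer alive across the sleep */
	mtx_leave(&obj->mtx);
	sleep_lock(&peer->lock);           /* may sleep; splice may go away */
	if (still_spliced(obj))            /* revalidate before teardown */
		teardown(obj, peer);
	sleep_unlock(&peer->lock);
	refcnt_rele(&peer->refs);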
@@ -1411,14 +1481,6 @@ sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
goto release;
}
- /* Splice so and sosp together. */
- mtx_enter(&so->so_rcv.sb_mtx);
- mtx_enter(&sosp->so_snd.sb_mtx);
- so->so_sp->ssp_socket = sosp;
- sosp->so_sp->ssp_soback = so;
- mtx_leave(&sosp->so_snd.sb_mtx);
- mtx_leave(&so->so_rcv.sb_mtx);
-
so->so_splicelen = 0;
so->so_splicemax = max;
if (tv)
@@ -1429,9 +1491,20 @@ sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
task_set(&so->so_splicetask, sotask, so);
/*
- * To prevent softnet interrupt from calling somove() while
- * we sleep, the socket buffers are not marked as spliced yet.
+ * To prevent sorwakeup() from calling somove() before this
+ * somove() has finished, the socket buffers are not marked as
+ * spliced yet.
*/
+
+ /* Splice so and sosp together. */
+ mtx_enter(&so->so_rcv.sb_mtx);
+ mtx_enter(&sosp->so_snd.sb_mtx);
+ so->so_sp->ssp_socket = sosp;
+ sosp->so_sp->ssp_soback = so;
+ mtx_leave(&sosp->so_snd.sb_mtx);
+ mtx_leave(&so->so_rcv.sb_mtx);
+
+ if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0)
+ sounlock(so);
if (somove(so, M_WAIT)) {
mtx_enter(&so->so_rcv.sb_mtx);
mtx_enter(&sosp->so_snd.sb_mtx);
@@ -1440,6 +1513,8 @@ sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
mtx_leave(&sosp->so_snd.sb_mtx);
mtx_leave(&so->so_rcv.sb_mtx);
}
+ if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0)
+ solock(so);
release:
sounlock(so);
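Note the reordering in this hunk: the splice is published (the `ssp_*' pointers under both `sb_mtx' mutexes) only after the task and timeout are set up, and for udp(4) the exclusive solock() is dropped around the first somove() pass so it runs under buffer locks only. As a sketch of the resulting tail of sosplice():

	task_set(&so->so_splicetask, sotask, so);   /* task must be ready first */
	/* ... publish splice under both sb_mtx mutexes ... */
	if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0)
		sounlock(so);               /* dgram: somove() needs no solock() */
	somove(so, M_WAIT);
	if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0)
		solock(so);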
@@ -1454,6 +1529,8 @@ sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
void
sounsplice(struct socket *so, struct socket *sosp, int freeing)
{
+ if ((so->so_proto->pr_flags & PR_WANTRCVD) == 0)
+ sbassertlocked(&so->so_rcv);
soassertlocked(so);
task_del(sosplice_taskq, &so->so_splicetask);
@@ -1479,32 +1556,51 @@ soidle(void *arg)
{
struct socket *so = arg;
+ sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
solock(so);
+ /*
+ * Depending on the socket type, sblock(&so->so_rcv) or
+ * solock() is always held while SB_SPLICE and
+ * so->so_sp->ssp_socket are modified.
+ */
if (so->so_rcv.sb_flags & SB_SPLICE) {
so->so_error = ETIMEDOUT;
sounsplice(so, so->so_sp->ssp_socket, 0);
}
sounlock(so);
+ sbunlock(&so->so_rcv);
}
void
sotask(void *arg)
{
struct socket *so = arg;
+ int doyield = 0;
+ int sockstream = (so->so_proto->pr_flags & PR_WANTRCVD);
+
+ /*
+ * sblock() on `so_rcv' protects sockets from being unspliced
+ * in the UDP case. TCP sockets still rely on solock().
+ */
+
+ sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
+ if (sockstream)
+ solock(so);
- solock(so);
if (so->so_rcv.sb_flags & SB_SPLICE) {
- /*
- * We may not sleep here as sofree() and unsplice() may be
- * called from softnet interrupt context. This would remove
- * the socket during somove().
- */
+ if (sockstream)
+ doyield = 1;
somove(so, M_DONTWAIT);
}
- sounlock(so);
- /* Avoid user land starvation. */
- yield();
+ if (sockstream)
+ sounlock(so);
+ sbunlock(&so->so_rcv);
+
+ if (doyield) {
+ /* Avoid user land starvation. */
+ yield();
+ }
}
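After this split only the tcp(4) path still holds the exclusive solock() across somove(), so only it needs the yield() to avoid starving userland; the udp(4) path runs under `sb_lock'/`sb_mtx' alone. Condensed shape of the new task body (`sockstream' is the PR_WANTRCVD, i.e. tcp(4), case):

	sblock(&so->so_rcv, SBL_WAIT | SBL_NOINTR);
	if (sockstream)
		solock(so);                 /* tcp(4) still moves under solock() */
	if (so->so_rcv.sb_flags & SB_SPLICE)
		somove(so, M_DONTWAIT);
	if (sockstream)
		sounlock(so);
	sbunlock(&so->so_rcv);
	if (sockstream)
		yield();                    /* only the solock() holder yields */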
/*
@@ -1546,24 +1642,32 @@ somove(struct socket *so, int wait)
struct mbuf *m, **mp, *nextrecord;
u_long len, off, oobmark;
long space;
- int error = 0, maxreached = 0;
+ int error = 0, maxreached = 0, unsplice = 0;
unsigned int rcvstate;
+ int sockdgram = ((so->so_proto->pr_flags &
+ PR_WANTRCVD) == 0);
- soassertlocked(so);
+ if (sockdgram)
+ sbassertlocked(&so->so_rcv);
+ else
+ soassertlocked(so);
+
+ mtx_enter(&so->so_rcv.sb_mtx);
+ mtx_enter(&sosp->so_snd.sb_mtx);
nextpkt:
- if (so->so_error) {
- error = so->so_error;
+ if ((error = READ_ONCE(so->so_error)))
goto release;
- }
if (sosp->so_snd.sb_state & SS_CANTSENDMORE) {
error = EPIPE;
goto release;
}
- if (sosp->so_error && sosp->so_error != ETIMEDOUT &&
- sosp->so_error != EFBIG && sosp->so_error != ELOOP) {
- error = sosp->so_error;
- goto release;
+
+ error = READ_ONCE(sosp->so_error);
+ if (error) {
+ if (error != ETIMEDOUT && error != EFBIG && error != ELOOP)
+ goto release;
+ error = 0;
}
if ((sosp->so_state & SS_ISCONNECTED) == 0)
goto release;
@@ -1577,26 +1681,21 @@ somove(struct socket *so, int wait)
maxreached = 1;
}
}
- mtx_enter(&sosp->so_snd.sb_mtx);
space = sbspace_locked(sosp, &sosp->so_snd);
if (so->so_oobmark && so->so_oobmark < len &&
so->so_oobmark < space + 1024)
space += 1024;
if (space <= 0) {
- mtx_leave(&sosp->so_snd.sb_mtx);
maxreached = 0;
goto release;
}
if (space < len) {
maxreached = 0;
- if (space < sosp->so_snd.sb_lowat) {
- mtx_leave(&sosp->so_snd.sb_mtx);
+ if (space < sosp->so_snd.sb_lowat)
goto release;
- }
len = space;
}
sosp->so_snd.sb_state |= SS_ISSENDING;
- mtx_leave(&sosp->so_snd.sb_mtx);
SBLASTRECORDCHK(&so->so_rcv, "somove 1");
SBLASTMBUFCHK(&so->so_rcv, "somove 1");
@@ -1618,8 +1717,13 @@ somove(struct socket *so, int wait)
m = m->m_next;
if (m == NULL) {
sbdroprecord(so, &so->so_rcv);
- if (so->so_proto->pr_flags & PR_WANTRCVD)
+ if (so->so_proto->pr_flags & PR_WANTRCVD) {
+ mtx_leave(&sosp->so_snd.sb_mtx);
+ mtx_leave(&so->so_rcv.sb_mtx);
pru_rcvd(so);
+ mtx_enter(&so->so_rcv.sb_mtx);
+ mtx_enter(&sosp->so_snd.sb_mtx);
+ }
goto nextpkt;
}
@@ -1724,11 +1828,15 @@ somove(struct socket *so, int wait)
}
/* Send window update to source peer as receive buffer has changed. */
- if (so->so_proto->pr_flags & PR_WANTRCVD)
+ if (so->so_proto->pr_flags & PR_WANTRCVD) {
+ mtx_leave(&sosp->so_snd.sb_mtx);
+ mtx_leave(&so->so_rcv.sb_mtx);
pru_rcvd(so);
+ mtx_enter(&so->so_rcv.sb_mtx);
+ mtx_enter(&sosp->so_snd.sb_mtx);
+ }
/* Receive buffer did shrink by len bytes, adjust oob. */
- mtx_enter(&so->so_rcv.sb_mtx);
rcvstate = so->so_rcv.sb_state;
so->so_rcv.sb_state &= ~SS_RCVATMARK;
oobmark = so->so_oobmark;
@@ -1739,7 +1847,6 @@ somove(struct socket *so, int wait)
if (oobmark >= len)
oobmark = 0;
}
- mtx_leave(&so->so_rcv.sb_mtx);
/*
* Handle oob data. If any malloc fails, ignore error.
@@ -1755,7 +1862,12 @@ somove(struct socket *so, int wait)
} else if (oobmark) {
o = m_split(m, oobmark, wait);
if (o) {
+ mtx_leave(&sosp->so_snd.sb_mtx);
+ mtx_leave(&so->so_rcv.sb_mtx);
error = pru_send(sosp, m, NULL, NULL);
+ mtx_enter(&so->so_rcv.sb_mtx);
+ mtx_enter(&sosp->so_snd.sb_mtx);
+
if (error) {
if (sosp->so_snd.sb_state &
SS_CANTSENDMORE)
@@ -1773,7 +1885,13 @@ somove(struct socket *so, int wait)
if (o) {
o->m_len = 1;
*mtod(o, caddr_t) = *mtod(m, caddr_t);
+
+ mtx_leave(&sosp->so_snd.sb_mtx);
+ mtx_leave(&so->so_rcv.sb_mtx);
error = pru_sendoob(sosp, o, NULL, NULL);
+ mtx_enter(&so->so_rcv.sb_mtx);
+ mtx_enter(&sosp->so_snd.sb_mtx);
+
if (error) {
if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
error = EPIPE;
@@ -1791,15 +1909,25 @@ somove(struct socket *so, int wait)
}
}
- mtx_enter(&sosp->so_snd.sb_mtx);
/* Append all remaining data to drain socket. */
if (so->so_rcv.sb_cc == 0 || maxreached)
sosp->so_snd.sb_state &= ~SS_ISSENDING;
+
mtx_leave(&sosp->so_snd.sb_mtx);
+ mtx_leave(&so->so_rcv.sb_mtx);
+ if (sockdgram)
+ solock_shared(sosp);
error = pru_send(sosp, m, NULL, NULL);
+ if (sockdgram)
+ sounlock_shared(sosp);
+
+ mtx_enter(&so->so_rcv.sb_mtx);
+ mtx_enter(&sosp->so_snd.sb_mtx);
+
if (error) {
- if (sosp->so_snd.sb_state & SS_CANTSENDMORE)
+ if (sosp->so_snd.sb_state & SS_CANTSENDMORE ||
+ sosp->so_pcb == NULL)
error = EPIPE;
goto release;
}
@@ -1810,26 +1938,35 @@ somove(struct socket *so, int wait)
goto nextpkt;
release:
- mtx_enter(&sosp->so_snd.sb_mtx);
sosp->so_snd.sb_state &= ~SS_ISSENDING;
- mtx_leave(&sosp->so_snd.sb_mtx);
if (!error && maxreached && so->so_splicemax == so->so_splicelen)
error = EFBIG;
if (error)
- so->so_error = error;
+ WRITE_ONCE(so->so_error, error);
+
if (((so->so_rcv.sb_state & SS_CANTRCVMORE) &&
so->so_rcv.sb_cc == 0) ||
(sosp->so_snd.sb_state & SS_CANTSENDMORE) ||
- maxreached || error) {
+ maxreached || error)
+ unsplice = 1;
+
+ mtx_leave(&sosp->so_snd.sb_mtx);
+ mtx_leave(&so->so_rcv.sb_mtx);
+
+ if (unsplice) {
+ if (sockdgram)
+ solock(so);
sounsplice(so, sosp, 0);
+ if (sockdgram)
+ sounlock(so);
+
return (0);
}
if (timerisset(&so->so_idletv))
timeout_add_tv(&so->so_idleto, &so->so_idletv);
return (1);
}
-
#endif /* SOCKET_SPLICE */
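Two conventions recur across the somove() hunks above: the buffer mutexes are held for the whole move loop and dropped only around protocol entry points (pru_rcvd(), pru_send(), pru_sendoob()), which may take solock() and sleep; and `so_error' is accessed with READ_ONCE()/WRITE_ONCE() because it can now be read without the socket lock. The recurring drop-and-reacquire shape, as a sketch:

	mtx_leave(&sosp->so_snd.sb_mtx);    /* sb_mtx may not be held across */
	mtx_leave(&so->so_rcv.sb_mtx);      /* a protocol call */
	pru_rcvd(so);                       /* may take solock(), may sleep */
	mtx_enter(&so->so_rcv.sb_mtx);      /* reacquire in the same order */
	mtx_enter(&sosp->so_snd.sb_mtx);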
void
@@ -1839,22 +1976,16 @@ sorwakeup(struct socket *so)
soassertlocked_readonly(so);
#ifdef SOCKET_SPLICE
- if (so->so_rcv.sb_flags & SB_SPLICE) {
- /*
- * TCP has a sendbuffer that can handle multiple packets
- * at once. So queue the stream a bit to accumulate data.
- * The sosplice thread will call somove() later and send
- * the packets calling tcp_output() only once.
- * In the UDP case, send out the packets immediately.
- * Using a thread would make things slower.
- */
- if (so->so_proto->pr_flags & PR_WANTRCVD)
+ if (so->so_proto->pr_flags & PR_SPLICE) {
+ sb_mtx_lock(&so->so_rcv);
+ if (so->so_rcv.sb_flags & SB_SPLICE)
task_add(sosplice_taskq, &so->so_splicetask);
- else
- somove(so, M_DONTWAIT);
+ if (isspliced(so)) {
+ sb_mtx_unlock(&so->so_rcv);
+ return;
+ }
+ sb_mtx_unlock(&so->so_rcv);
}
- if (isspliced(so))
- return;
#endif
sowakeup(so, &so->so_rcv);
if (so->so_upcall)
@@ -1868,10 +1999,17 @@ sowwakeup(struct socket *so)
soassertlocked_readonly(so);
#ifdef SOCKET_SPLICE
- if (so->so_snd.sb_flags & SB_SPLICE)
- task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask);
- if (issplicedback(so))
- return;
+ if (so->so_proto->pr_flags & PR_SPLICE) {
+ sb_mtx_lock(&so->so_snd);
+ if (so->so_snd.sb_flags & SB_SPLICE)
+ task_add(sosplice_taskq,
+ &so->so_sp->ssp_soback->so_splicetask);
+ if (issplicedback(so)) {
+ sb_mtx_unlock(&so->so_snd);
+ return;
+ }
+ sb_mtx_unlock(&so->so_snd);
+ }
#endif
sowakeup(so, &so->so_snd);
}