From 893ef53ee7763c2392fa5cd0397f94ad645f1c99 Mon Sep 17 00:00:00 2001 From: cheloha Date: Sat, 26 Dec 2020 16:30:59 +0000 Subject: bpf(4): bpf_d struct: replace bd_rdStart member with bd_nreaders member bd_rdStart is strange. It nominally represents the start of a read(2) on a given bpf(4) descriptor, but there are several problems with it: 1. If there are multiple readers, the bd_rdStart is not set by subsequent readers, so their timeout is screwed up. The read timeout should really be tracked on a per-thread basis in bpfread(). 2. We set bd_rdStart for poll(2), select(2), and kevent(2), even though that makes no sense. We should not be setting bd_rdStart in bpfpoll() or bpfkqfilter(). 3. bd_rdStart is buggy. If ticks is 0 when the read starts then bpf_catchpacket() won't wake up the reader. This is a problem inherent to the design of bd_rdStart: it serves as both a boolean and a scalar value, even though 0 is a valid value in the scalar range. So let's replace it with a better struct member. "bd_nreaders" is a count of threads sleeping in bpfread(). It is incremented before a thread goes to sleep in bpfread() and decremented when a thread wakes up. If bd_nreaders is greater than zero when we reach bpf_catchpacket() and fbuf is non-NULL we wake up all readers. The read timeout, if any, is now tracked locally by the thread in bpfread(). Unlike bd_rdStart, bpfpoll() and bpfkqfilter() don't touch bd_nreaders. Prompted by mpi@. Basic idea from dlg@. Lots of input from dlg@. Tested by dlg@ with tcpdump(8) (blocking read) and flow-collector (https://github.com/eait-itig/flow-collector, non-blocking read). ok dlg@ --- sys/net/bpf.c | 56 +++++++++++++++++++++++--------------------------------- 1 file changed, 23 insertions(+), 33 deletions(-) (limited to 'sys/net/bpf.c') diff --git a/sys/net/bpf.c b/sys/net/bpf.c index 742a8570f13..1ac05458f60 100644 --- a/sys/net/bpf.c +++ b/sys/net/bpf.c @@ -1,4 +1,4 @@ -/* $OpenBSD: bpf.c,v 1.198 2020/12/25 12:59:53 visa Exp $ */ +/* $OpenBSD: bpf.c,v 1.199 2020/12/26 16:30:58 cheloha Exp $ */ /* $NetBSD: bpf.c,v 1.33 1997/02/21 23:59:35 thorpej Exp $ */ /* @@ -430,7 +430,7 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) { struct bpf_d *d; caddr_t hbuf; - int hlen, error; + int end, error, hlen, nticks; KERNEL_ASSERT_LOCKED(); @@ -451,13 +451,10 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) } /* - * If there's a timeout, bd_rdStart is tagged when we start the read. - * we can then figure out when we're done reading. + * If there's a timeout, mark when the read should end. */ - if (d->bd_rnonblock == 0 && d->bd_rdStart == 0) - d->bd_rdStart = ticks; - else - d->bd_rdStart = 0; + if (d->bd_rtout) + end = ticks + (int)d->bd_rtout; /* * If the hold buffer is empty, then do a timed sleep, which @@ -486,13 +483,21 @@ bpfread(dev_t dev, struct uio *uio, int ioflag) if (d->bd_rnonblock) { /* User requested non-blocking I/O */ error = EWOULDBLOCK; + } else if (d->bd_rtout == 0) { + /* No read timeout set. */ + d->bd_nreaders++; + error = msleep_nsec(d, &d->bd_mtx, PRINET|PCATCH, + "bpf", INFSLP); + d->bd_nreaders--; + } else if ((nticks = end - ticks) > 0) { + /* Read timeout has not expired yet. */ + d->bd_nreaders++; + error = msleep(d, &d->bd_mtx, PRINET|PCATCH, "bpf", + nticks); + d->bd_nreaders--; } else { - if (d->bd_rdStart <= ULONG_MAX - d->bd_rtout && - d->bd_rdStart + d->bd_rtout < ticks) { - error = msleep(d, &d->bd_mtx, PRINET|PCATCH, - "bpf", d->bd_rtout); - } else - error = EWOULDBLOCK; + /* Read timeout has expired. */ + error = EWOULDBLOCK; } if (error == EINTR || error == ERESTART) goto out; @@ -1153,15 +1158,8 @@ bpfpoll(dev_t dev, int events, struct proc *p) mtx_enter(&d->bd_mtx); if (d->bd_hlen != 0 || (d->bd_immediate && d->bd_slen != 0)) revents |= events & (POLLIN | POLLRDNORM); - else { - /* - * if there's a timeout, mark the time we - * started waiting. - */ - if (d->bd_rnonblock == 0 && d->bd_rdStart == 0) - d->bd_rdStart = ticks; + else selrecord(p, &d->bd_sel); - } mtx_leave(&d->bd_mtx); } return (revents); @@ -1197,11 +1195,6 @@ bpfkqfilter(dev_t dev, struct knote *kn) kn->kn_hook = d; klist_insert_locked(klist, kn); - mtx_enter(&d->bd_mtx); - if (d->bd_rnonblock == 0 && d->bd_rdStart == 0) - d->bd_rdStart = ticks; - mtx_leave(&d->bd_mtx); - return (0); } @@ -1551,15 +1544,12 @@ bpf_catchpacket(struct bpf_d *d, u_char *pkt, size_t pktlen, size_t snaplen, do_wakeup = 1; } - if (d->bd_rdStart && d->bd_rdStart <= ULONG_MAX - d->bd_rtout && - d->bd_rdStart + d->bd_rtout < ticks) { + if (d->bd_nreaders > 0) { /* - * we could be selecting on the bpf, and we - * may have timeouts set. We got here by getting - * a packet, so wake up the reader. + * We have one or more threads sleeping in bpfread(). + * We got a packet, so wake up all readers. */ if (d->bd_fbuf != NULL) { - d->bd_rdStart = 0; ROTATE_BUFFERS(d); do_wakeup = 1; } -- cgit v1.2.3