/* $OpenBSD: sys_generic.c,v 1.158 2024/08/12 19:32:05 anton Exp $ */ /* $NetBSD: sys_generic.c,v 1.24 1996/03/29 00:25:32 cgd Exp $ */ /* * Copyright (c) 1996 Theo de Raadt * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include /* * Debug values: * 1 - print implementation errors, things that should not happen. * 2 - print ppoll(2) information, somewhat verbose * 3 - print pselect(2) and ppoll(2) information, very verbose */ /* #define KQPOLL_DEBUG */ #ifdef KQPOLL_DEBUG int kqpoll_debug = 1; #define DPRINTFN(v, x...) if (kqpoll_debug > v) { \ printf("%s(%d): ", curproc->p_p->ps_comm, curproc->p_tid); \ printf(x); \ } #else #define DPRINTFN(v, x...) 
do {} while (0);
#endif

/*
 * Internal helpers shared by the select(2)/pselect(2) and
 * poll(2)/ppoll(2) implementations below.
 */
int	pselregister(struct proc *, fd_set **, fd_set **, int, int *, int *);
int	pselcollect(struct proc *, struct kevent *, fd_set **, int *);
void	ppollregister(struct proc *, struct pollfd *, int, int *, int *);
int	ppollcollect(struct proc *, struct kevent *, struct pollfd *, u_int);

int	pollout(struct pollfd *, struct pollfd *, u_int);
int	dopselect(struct proc *, int, fd_set *, fd_set *, fd_set *,
	    struct timespec *, const sigset_t *, register_t *);
int	doppoll(struct proc *, struct pollfd *, u_int, struct timespec *,
	    const sigset_t *, register_t *);

/*
 * Copy an iovec array in from userland.
 *
 * On success, *iovp points at the kernel copy: the caller-supplied
 * stack array `aiov' when iovcnt <= UIO_SMALLIOV, otherwise a
 * mallocarray(9) allocation.  *residp (if not NULL) receives the total
 * byte count.  The caller must always pair this with iovec_free(),
 * even on error, since *iovp is set before the copyin can fail.
 *
 * Returns 0, or EINVAL for a bad count / oversized transfer, or the
 * copyin(9) error.
 */
int
iovec_copyin(const struct iovec *uiov, struct iovec **iovp, struct iovec *aiov,
    unsigned int iovcnt, size_t *residp)
{
#ifdef KTRACE
	struct proc *p = curproc;
#endif
	struct iovec *iov;
	int error, i;
	size_t resid = 0;

	if (iovcnt > UIO_SMALLIOV) {
		/* Large arrays are heap-allocated, bounded by IOV_MAX. */
		if (iovcnt > IOV_MAX)
			return (EINVAL);
		iov = mallocarray(iovcnt, sizeof(*iov), M_IOV, M_WAITOK);
	} else if (iovcnt > 0) {
		iov = aiov;
	} else {
		return (EINVAL);
	}
	/* Publish the pointer first so the caller can free it on error. */
	*iovp = iov;

	if ((error = copyin(uiov, iov, iovcnt * sizeof(*iov))))
		return (error);

#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktriovec(p, iov, iovcnt);
#endif

	for (i = 0; i < iovcnt; i++) {
		resid += iov->iov_len;
		/*
		 * Writes return ssize_t because -1 is returned on error.
		 * Therefore we must restrict the length to SSIZE_MAX to
		 * avoid garbage return values.  Note that the addition is
		 * guaranteed to not wrap because SSIZE_MAX * 2 < SIZE_MAX.
		 */
		if (iov->iov_len > SSIZE_MAX || resid > SSIZE_MAX)
			return (EINVAL);
		iov++;
	}

	if (residp != NULL)
		*residp = resid;

	return (0);
}

/*
 * Release an iovec array obtained from iovec_copyin().  A no-op for
 * the small (stack-backed) case.
 */
void
iovec_free(struct iovec *iov, unsigned int iovcnt)
{
	if (iovcnt > UIO_SMALLIOV)
		free(iov, M_IOV, iovcnt * sizeof(*iov));
}

/*
 * Read system call.
*/
int
sys_read(struct proc *p, void *v, register_t *retval)
{
	struct sys_read_args /* {
		syscallarg(int) fd;
		syscallarg(void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	struct uio auio;

	iov.iov_base = SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);
	/* Reads return ssize_t; reject lengths the return type can't express. */
	if (iov.iov_len > SSIZE_MAX)
		return (EINVAL);

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = iov.iov_len;

	return (dofilereadv(p, SCARG(uap, fd), &auio, 0, retval));
}

/*
 * Scatter read system call.
 */
int
sys_readv(struct proc *p, void *v, register_t *retval)
{
	struct sys_readv_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error, iovcnt = SCARG(uap, iovcnt);
	struct uio auio;
	size_t resid;

	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
	if (error)
		goto done;

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = resid;

	error = dofilereadv(p, SCARG(uap, fd), &auio, 0, retval);
done:
	/* Always free: iovec_copyin() sets iov before it can fail. */
	iovec_free(iov, iovcnt);
	return (error);
}

/*
 * Common backend for read(2)/readv(2)/pread(2)/preadv(2).
 *
 * `flags' may carry FO_POSITION for the positioned variants, in which
 * case uio_offset must already be set by the caller.  On success
 * *retval holds the number of bytes transferred.
 */
int
dofilereadv(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FREAD)) == NULL)
		return (EBADF);

	/* Checks for positioned read. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		/* Positioned I/O only makes sense on seekable vnodes. */
		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_READ;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_read)(fp, uio, flags);
	if (error) {
		/*
		 * If anything was transferred before the interruption,
		 * report the partial count as success instead of the
		 * restartable error.
		 */
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
	}
	cnt -= uio->uio_resid;

	/* Per-file transfer statistics. */
	mtx_enter(&fp->f_mtx);
	fp->f_rxfer++;
	fp->f_rbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_READ, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
done:
	FRELE(fp, p);
	return (error);
}

/*
 * Write system call
 */
int
sys_write(struct proc *p, void *v, register_t *retval)
{
	struct sys_write_args /* {
		syscallarg(int) fd;
		syscallarg(const void *) buf;
		syscallarg(size_t) nbyte;
	} */ *uap = v;
	struct iovec iov;
	struct uio auio;

	iov.iov_base = (void *)SCARG(uap, buf);
	iov.iov_len = SCARG(uap, nbyte);
	if (iov.iov_len > SSIZE_MAX)
		return (EINVAL);

	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_resid = iov.iov_len;

	return (dofilewritev(p, SCARG(uap, fd), &auio, 0, retval));
}

/*
 * Gather write system call
 */
int
sys_writev(struct proc *p, void *v, register_t *retval)
{
	struct sys_writev_args /* {
		syscallarg(int) fd;
		syscallarg(const struct iovec *) iovp;
		syscallarg(int) iovcnt;
	} */ *uap = v;
	struct iovec aiov[UIO_SMALLIOV], *iov = NULL;
	int error, iovcnt = SCARG(uap, iovcnt);
	struct uio auio;
	size_t resid;

	error = iovec_copyin(SCARG(uap, iovp), &iov, aiov, iovcnt, &resid);
	if (error)
		goto done;

	auio.uio_iov = iov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = resid;

	error = dofilewritev(p, SCARG(uap, fd), &auio, 0, retval);
done:
	iovec_free(iov, iovcnt);
	return (error);
}

/*
 * Common backend for write(2)/writev(2)/pwrite(2)/pwritev(2).
 * Mirrors dofilereadv(), with the addition of the POSIX-mandated
 * SIGPIPE delivery on EPIPE.
 */
int
dofilewritev(struct proc *p, int fd, struct uio *uio, int flags,
    register_t *retval)
{
	struct filedesc *fdp = p->p_fd;
	struct file *fp;
	long cnt, error = 0;
	u_int iovlen;
#ifdef KTRACE
	struct iovec *ktriov = NULL;
#endif

	KASSERT(uio->uio_iov != NULL && uio->uio_iovcnt > 0);
	iovlen = uio->uio_iovcnt * sizeof(struct iovec);

	if ((fp = fd_getfile_mode(fdp, fd, FWRITE)) == NULL)
		return (EBADF);

	/* Checks for positioned write. */
	if (flags & FO_POSITION) {
		struct vnode *vp = fp->f_data;

		if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO ||
		    (vp->v_flag & VISTTY)) {
			error = ESPIPE;
			goto done;
		}

		if (uio->uio_offset < 0 && vp->v_type != VCHR) {
			error = EINVAL;
			goto done;
		}
	}

	uio->uio_rw = UIO_WRITE;
	uio->uio_segflg = UIO_USERSPACE;
	uio->uio_procp = p;

#ifdef KTRACE
	/*
	 * if tracing, save a copy of iovec
	 */
	if (KTRPOINT(p, KTR_GENIO)) {
		ktriov = malloc(iovlen, M_TEMP, M_WAITOK);
		memcpy(ktriov, uio->uio_iov, iovlen);
	}
#endif
	cnt = uio->uio_resid;
	error = (*fp->f_ops->fo_write)(fp, uio, flags);
	if (error) {
		/* Partial transfer before interruption counts as success. */
		if (uio->uio_resid != cnt && (error == ERESTART ||
		    error == EINTR || error == EWOULDBLOCK))
			error = 0;
		/* POSIX: a write to a broken pipe also raises SIGPIPE. */
		if (error == EPIPE) {
			KERNEL_LOCK();
			ptsignal(p, SIGPIPE, STHREAD);
			KERNEL_UNLOCK();
		}
	}
	cnt -= uio->uio_resid;

	mtx_enter(&fp->f_mtx);
	fp->f_wxfer++;
	fp->f_wbytes += cnt;
	mtx_leave(&fp->f_mtx);
#ifdef KTRACE
	if (ktriov != NULL) {
		if (error == 0)
			ktrgenio(p, fd, UIO_WRITE, ktriov, cnt);
		free(ktriov, M_TEMP, iovlen);
	}
#endif
	*retval = cnt;
done:
	FRELE(fp, p);
	return (error);
}

/*
 * Ioctl system call
 */
int
sys_ioctl(struct proc *p, void *v, register_t *retval)
{
	struct sys_ioctl_args /* {
		syscallarg(int) fd;
		syscallarg(u_long) com;
		syscallarg(void *) data;
	} */ *uap = v;
	struct file *fp;
	struct filedesc *fdp = p->p_fd;
	u_long com = SCARG(uap, com);
	int error = 0;
	u_int size = 0;
	caddr_t data, memp = NULL;
	int tmp;
#define STK_PARAMS	128
	long long stkbuf[STK_PARAMS / sizeof(long long)];

	if ((fp = fd_getfile_mode(fdp, SCARG(uap, fd), FREAD|FWRITE)) == NULL)
		return (EBADF);

	if (fp->f_type == DTYPE_SOCKET) {
		struct socket *so = fp->f_data;

		/* DNS sockets handed out by pledge may not be ioctl'd. */
		if (so->so_state & SS_DNS) {
			error = EINVAL;
			goto out;
		}
	}

	error = pledge_ioctl(p, com, fp);
	if (error)
		goto out;

	/* FIOCLEX/FIONCLEX act on the descriptor, not the file. */
	switch (com) {
	case FIONCLEX:
	case FIOCLEX:
		fdplock(fdp);
		if (com == FIONCLEX)
			fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
		else
			fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
		fdpunlock(fdp);
		goto out;
	}

	/*
	 * Interpret high order word to find amount of data to be
	 * copied to/from the user's address space.
	 */
	size = IOCPARM_LEN(com);
	if (size > IOCPARM_MAX) {
		error = ENOTTY;
		goto out;
	}
	if (size > sizeof (stkbuf)) {
		memp = malloc(size, M_IOCTLOPS, M_WAITOK);
		data = memp;
	} else
		data = (caddr_t)stkbuf;
	if (com&IOC_IN) {
		if (size) {
			error = copyin(SCARG(uap, data), data, size);
			if (error) {
				goto out;
			}
		} else
			*(caddr_t *)data = SCARG(uap, data);
	} else if ((com&IOC_OUT) && size)
		/*
		 * Zero the buffer so the user always
		 * gets back something deterministic.
		 */
		memset(data, 0, size);
	else if (com&IOC_VOID)
		*(caddr_t *)data = SCARG(uap, data);

	switch (com) {

	case FIONBIO:
		/* Mirror the request in f_flag, then pass it down. */
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FNONBLOCK);
		else
			atomic_clearbits_int(&fp->f_flag, FNONBLOCK);
		error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
		break;

	case FIOASYNC:
		if ((tmp = *(int *)data) != 0)
			atomic_setbits_int(&fp->f_flag, FASYNC);
		else
			atomic_clearbits_int(&fp->f_flag, FASYNC);
		error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p);
		break;

	default:
		error = (*fp->f_ops->fo_ioctl)(fp, com, data, p);
		break;
	}
	/*
	 * Copy any data to user, size was
	 * already set and checked above.
	 */
	if (error == 0 && (com&IOC_OUT) && size)
		error = copyout(data, SCARG(uap, data), size);
out:
	FRELE(fp, p);
	/* free(9) with memp == NULL is a no-op. */
	free(memp, M_IOCTLOPS, size);
	return (error);
}

/*
 * Select system call.
*/
int
sys_select(struct proc *p, void *v, register_t *retval)
{
	struct sys_select_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(struct timeval *) tv;
	} */ *uap = v;
	struct timespec ts, *tsp = NULL;
	int error;

	if (SCARG(uap, tv) != NULL) {
		struct timeval tv;

		if ((error = copyin(SCARG(uap, tv), &tv, sizeof tv)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimeval(p, &tv);
#endif
		if (tv.tv_sec < 0 || !timerisvalid(&tv))
			return (EINVAL);
		TIMEVAL_TO_TIMESPEC(&tv, &ts);
		tsp = &ts;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, NULL, retval));
}

int
sys_pselect(struct proc *p, void *v, register_t *retval)
{
	struct sys_pselect_args /* {
		syscallarg(int) nd;
		syscallarg(fd_set *) in;
		syscallarg(fd_set *) ou;
		syscallarg(fd_set *) ex;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;
	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;
	int error;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}
	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (dopselect(p, SCARG(uap, nd), SCARG(uap, in), SCARG(uap, ou),
	    SCARG(uap, ex), tsp, ssp, retval));
}

/*
 * Common backend for select(2)/pselect(2), implemented on top of the
 * per-thread kqueue (p->p_kq): the fd_sets are converted to kevents,
 * collected via kqueue_scan(), and converted back.
 *
 * `timeout' is NULL to block indefinitely; `sigmask', if not NULL, is
 * installed for the duration of the call via dosigsuspend().
 */
int
dopselect(struct proc *p, int nd, fd_set *in, fd_set *ou, fd_set *ex,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	struct timespec zerots = {};
	fd_mask bits[6];
	fd_set *pibits[3], *pobits[3];
	int error, nfiles, ncollected = 0, nevents = 0;
	u_int ni;

	if (nd < 0)
		return (EINVAL);

	/* Clamp nd to the current table size; unused slots can't be set. */
	nfiles = READ_ONCE(p->p_fd->fd_nfiles);
	if (nd > nfiles)
		nd = nfiles;

	/*
	 * `ni' is the byte size of one fd_set.  Six sets are needed:
	 * three input (in/ou/ex) and three output.  Small sets fit in
	 * one fd_mask each on the stack; larger ones are allocated.
	 */
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni > sizeof(bits[0])) {
		caddr_t mbits;

		mbits = mallocarray(6, ni, M_TEMP, M_WAITOK|M_ZERO);
		pibits[0] = (fd_set *)&mbits[ni * 0];
		pibits[1] = (fd_set *)&mbits[ni * 1];
		pibits[2] = (fd_set *)&mbits[ni * 2];
		pobits[0] = (fd_set *)&mbits[ni * 3];
		pobits[1] = (fd_set *)&mbits[ni * 4];
		pobits[2] = (fd_set *)&mbits[ni * 5];
	} else {
		memset(bits, 0, sizeof(bits));
		pibits[0] = (fd_set *)&bits[0];
		pibits[1] = (fd_set *)&bits[1];
		pibits[2] = (fd_set *)&bits[2];
		pobits[0] = (fd_set *)&bits[3];
		pobits[1] = (fd_set *)&bits[4];
		pobits[2] = (fd_set *)&bits[5];
	}

	kqpoll_init(nd);

#define	getbits(name, x) \
	if (name && (error = copyin(name, pibits[x], ni))) \
		goto done;
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

#ifdef KTRACE
	if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
		if (in) ktrfdset(p, pibits[0], ni);
		if (ou) ktrfdset(p, pibits[1], ni);
		if (ex) ktrfdset(p, pibits[2], ni);
	}
#endif

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

	/* Register kqueue events */
	error = pselregister(p, pibits, pobits, nd, &nevents, &ncollected);
	if (error != 0)
		goto done;

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
	if (nevents == 0 && ncollected == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}
		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqsel", nsecs);
		/* select is not restarted after signals... */
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Do not block if registering found pending events. */
	if (ncollected > 0)
		timeout = &zerots;

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);

		/* Convert back events that are ready. */
		for (i = 0; i < ready && error == 0; i++)
			error = pselcollect(p, &kev[i], pobits, &ncollected);
		/*
		 * Stop if there was an error or if we had enough
		 * space to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
	*retval = ncollected;
done:
#define	putbits(name, x) \
	if (name && (error2 = copyout(pobits[x], name, ni))) \
		error = error2;
	if (error == 0) {
		int error2;

		putbits(in, 0);
		putbits(ou, 1);
		putbits(ex, 2);
#undef putbits
#ifdef KTRACE
		if (ni > 0 && KTRPOINT(p, KTR_STRUCT)) {
			if (in) ktrfdset(p, pobits[0], ni);
			if (ou) ktrfdset(p, pobits[1], ni);
			if (ex) ktrfdset(p, pobits[2], ni);
		}
#endif
	}

	/* Only one allocation backs all six sets; free via pibits[0]. */
	if (pibits[0] != (fd_set *)&bits[0])
		free(pibits[0], M_TEMP, 6 * ni);

	kqpoll_done(nd);

	return (error);
}

/*
 * Convert fd_set into kqueue events and register them on the
 * per-thread queue.
*/
int
pselregister(struct proc *p, fd_set *pibits[3], fd_set *pobits[3], int nfd,
    int *nregistered, int *ncollected)
{
	static const int evf[] = { EVFILT_READ, EVFILT_WRITE, EVFILT_EXCEPT };
	static const int evff[] = { 0, 0, NOTE_OOB };
	int msk, i, j, fd, nevents = 0, error = 0;
	struct kevent kev;
	fd_mask bits;

	/* One pass per set: read (0), write (1), except (2). */
	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			bits = pibits[msk]->fds_bits[i / NFDBITS];
			while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
				/*
				 * Clear the bit just found.  Use an
				 * unsigned constant: `1 << 31' would
				 * left-shift into the sign bit, which
				 * is undefined behavior.
				 */
				bits &= ~(1U << j);

				DPRINTFN(2, "select fd %d mask %d serial %lu\n",
				    fd, msk, p->p_kq_serial);
				EV_SET(&kev, fd, evf[msk],
				    EV_ADD|EV_ENABLE|__EV_SELECT,
				    evff[msk], 0, (void *)(p->p_kq_serial));
				error = kqueue_register(p->p_kq, &kev, 0, p);
				switch (error) {
				case 0:
					nevents++;
				/* FALLTHROUGH */
				case EOPNOTSUPP:/* No underlying kqfilter */
				case EINVAL:	/* Unimplemented filter */
				case EPERM:	/* Specific to FIFO and
						 * __EV_SELECT */
					error = 0;
					break;
				case ENXIO:	/* Device has been detached */
				default:
					goto bad;
				}
			}
		}
	}

	*nregistered = nevents;
	return (0);
bad:
	DPRINTFN(0, "select fd %u filt %d error %d\n", (int)kev.ident,
	    kev.filter, error);
	return (error);
}

/*
 * Convert given kqueue event into corresponding select(2) bit.
*/
int
pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3],
    int *ncollected)
{
	/*
	 * Events registered by pselregister() carry p_kq_serial in
	 * udata; anything else is a stale/foreign event and a bug.
	 */
	if ((unsigned long)kevp->udata != p->p_kq_serial) {
		panic("%s: spurious kevp %p fd %d udata 0x%lx serial 0x%lx",
		    __func__, kevp, (int)kevp->ident,
		    (unsigned long)kevp->udata, p->p_kq_serial);
	}

	if (kevp->flags & EV_ERROR) {
		DPRINTFN(2, "select fd %d filt %d error %d\n",
		    (int)kevp->ident, kevp->filter, (int)kevp->data);
		/* kevp->data carries the errno for the failed filter. */
		return (kevp->data);
	}

	/* Map the filter back onto the corresponding output fd_set. */
	switch (kevp->filter) {
	case EVFILT_READ:
		FD_SET(kevp->ident, pobits[0]);
		break;
	case EVFILT_WRITE:
		FD_SET(kevp->ident, pobits[1]);
		break;
	case EVFILT_EXCEPT:
		FD_SET(kevp->ident, pobits[2]);
		break;
	default:
		KASSERT(0);
	}

	(*ncollected)++;

	DPRINTFN(2, "select fd %d filt %d\n", (int)kevp->ident, kevp->filter);
	return (0);
}

/*
 * Do a wakeup when a selectable event occurs.
 */
void
selwakeup(struct selinfo *sip)
{
	KERNEL_LOCK();
	knote_locked(&sip->si_note, NOTE_SUBMIT);
	KERNEL_UNLOCK();
}

/*
 * Only copyout the revents field.
 */
int
pollout(struct pollfd *pl, struct pollfd *upl, u_int nfds)
{
	int error = 0;
	u_int i = 0;

	/* Copy each revents back individually; stop on the first fault. */
	while (!error && i++ < nfds) {
		error = copyout(&pl->revents, &upl->revents,
		    sizeof(upl->revents));
		pl++;
		upl++;
	}

	return (error);
}

/*
 * We are using the same mechanism as select only we encode/decode args
 * differently.
*/
int
sys_poll(struct proc *p, void *v, register_t *retval)
{
	struct sys_poll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(int) timeout;
	} */ *uap = v;
	struct timespec ts, *tsp = NULL;
	int msec = SCARG(uap, timeout);

	/* INFTIM (-1) means block forever; other negatives are invalid. */
	if (msec != INFTIM) {
		if (msec < 0)
			return (EINVAL);
		ts.tv_sec = msec / 1000;
		ts.tv_nsec = (msec - (ts.tv_sec * 1000)) * 1000000;
		tsp = &ts;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, NULL,
	    retval));
}

int
sys_ppoll(struct proc *p, void *v, register_t *retval)
{
	struct sys_ppoll_args /* {
		syscallarg(struct pollfd *) fds;
		syscallarg(u_int) nfds;
		syscallarg(const struct timespec *) ts;
		syscallarg(const sigset_t *) mask;
	} */ *uap = v;
	int error;
	struct timespec ts, *tsp = NULL;
	sigset_t ss, *ssp = NULL;

	if (SCARG(uap, ts) != NULL) {
		if ((error = copyin(SCARG(uap, ts), &ts, sizeof ts)) != 0)
			return (error);
#ifdef KTRACE
		if (KTRPOINT(p, KTR_STRUCT))
			ktrreltimespec(p, &ts);
#endif
		if (ts.tv_sec < 0 || !timespecisvalid(&ts))
			return (EINVAL);
		tsp = &ts;
	}

	if (SCARG(uap, mask) != NULL) {
		if ((error = copyin(SCARG(uap, mask), &ss, sizeof ss)) != 0)
			return (error);
		ssp = &ss;
	}

	return (doppoll(p, SCARG(uap, fds), SCARG(uap, nfds), tsp, ssp,
	    retval));
}

/*
 * Common backend for poll(2)/ppoll(2), built on the per-thread kqueue
 * like dopselect().  On success *retval is the number of pollfds with
 * non-zero revents; the revents fields are copied back to userland
 * even when the call is interrupted (EINTR) or times out.
 */
int
doppoll(struct proc *p, struct pollfd *fds, u_int nfds,
    struct timespec *timeout, const sigset_t *sigmask, register_t *retval)
{
	struct kqueue_scan_state scan;
	struct timespec zerots = {};
	struct pollfd pfds[4], *pl = pfds;
	int error, ncollected = 0, nevents = 0;
	size_t sz;

	/* Standards say no more than MAX_OPEN; this is possibly better. */
	if (nfds > min((int)lim_cur(RLIMIT_NOFILE), maxfiles))
		return (EINVAL);

	/* optimize for the default case, of a small nfds value */
	if (nfds > nitems(pfds)) {
		pl = mallocarray(nfds, sizeof(*pl), M_TEMP,
		    M_WAITOK | M_CANFAIL);
		if (pl == NULL)
			return (EINVAL);
	}

	kqpoll_init(nfds);

	sz = nfds * sizeof(*pl);

	if ((error = copyin(fds, pl, sz)) != 0)
		goto bad;

	if (sigmask)
		dosigsuspend(p, *sigmask &~ sigcantmask);

	/* Register kqueue events */
	ppollregister(p, pl, nfds, &nevents, &ncollected);

	/*
	 * The poll/select family of syscalls has been designed to
	 * block when file descriptors are not available, even if
	 * there's nothing to wait for.
	 */
	if (nevents == 0 && ncollected == 0) {
		uint64_t nsecs = INFSLP;

		if (timeout != NULL) {
			if (!timespecisset(timeout))
				goto done;
			nsecs = MAX(1, MIN(TIMESPEC_TO_NSEC(timeout), MAXTSLP));
		}

		error = tsleep_nsec(&nowake, PSOCK | PCATCH, "kqpoll", nsecs);
		if (error == ERESTART)
			error = EINTR;
		if (error == EWOULDBLOCK)
			error = 0;
		goto done;
	}

	/* Do not block if registering found pending events. */
	if (ncollected > 0)
		timeout = &zerots;

	/* Collect at most `nevents' possibly waiting in kqueue_scan() */
	kqueue_scan_setup(&scan, p->p_kq);
	while (nevents > 0) {
		struct kevent kev[KQ_NEVENTS];
		int i, ready, count;

		/* Maximum number of events per iteration */
		count = MIN(nitems(kev), nevents);
		ready = kqueue_scan(&scan, count, kev, timeout, p, &error);

		/* Convert back events that are ready. */
		for (i = 0; i < ready; i++)
			ncollected += ppollcollect(p, &kev[i], pl, nfds);

		/*
		 * Stop if there was an error or if we had enough
		 * place to collect all events that were ready.
		 */
		if (error || ready < count)
			break;

		nevents -= ready;
	}
	kqueue_scan_finish(&scan);
	*retval = ncollected;
done:
	/*
	 * NOTE: poll(2) is not restarted after a signal and EWOULDBLOCK is
	 * ignored (since the whole point is to see what would block).
	 */
	switch (error) {
	case EINTR:
		error = pollout(pl, fds, nfds);
		if (error == 0)
			error = EINTR;
		break;
	case EWOULDBLOCK:
	case 0:
		error = pollout(pl, fds, nfds);
		break;
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_STRUCT))
		ktrpollfd(p, pl, nfds);
#endif /* KTRACE */
bad:
	if (pl != pfds)
		free(pl, M_TEMP, sz);

	kqpoll_done(nfds);

	return (error);
}

/*
 * Register a batch of up to 3 kevents for one pollfd; sets error bits
 * (POLLNVAL/POLLERR) in pl->revents as needed and returns the number
 * of events successfully registered.
 */
int
ppollregister_evts(struct proc *p, struct kevent *kevp, int nkev,
    struct pollfd *pl, unsigned int pollid)
{
	int i, error, nevents = 0;

	KASSERT(pl->revents == 0);

	for (i = 0; i < nkev; i++, kevp++) {
again:
		error = kqueue_register(p->p_kq, kevp, pollid, p);
		switch (error) {
		case 0:
			nevents++;
			break;
		case EOPNOTSUPP:/* No underlying kqfilter */
		case EINVAL:	/* Unimplemented filter */
			break;
		case EBADF:	/* Bad file descriptor */
			pl->revents |= POLLNVAL;
			break;
		case EPERM:	/* Specific to FIFO */
			KASSERT(kevp->filter == EVFILT_WRITE);
			if (nkev == 1) {
				/*
				 * If this is the only filter make sure
				 * POLLHUP is passed to userland.
				 */
				kevp->filter = EVFILT_EXCEPT;
				goto again;
			}
			break;
		default:
			DPRINTFN(0, "poll err %lu fd %d revents %02x serial"
			    " %lu filt %d ERROR=%d\n",
			    ((unsigned long)kevp->udata - p->p_kq_serial),
			    pl->fd, pl->revents, p->p_kq_serial, kevp->filter,
			    error);
			/* FALLTHROUGH */
		case ENXIO:	/* Device has been detached */
			pl->revents |= POLLERR;
			break;
		}
	}

	return (nevents);
}

/*
 * Convert pollfd into kqueue events and register them on the
 * per-thread queue.
 *
 * At most 3 events can correspond to a single pollfd.
 */
void
ppollregister(struct proc *p, struct pollfd *pl, int nfds, int *nregistered,
    int *ncollected)
{
	int i, nkev, nevt, forcehup;
	struct kevent kev[3], *kevp;

	for (i = 0; i < nfds; i++) {
		pl[i].events &= ~POLL_NOHUP;
		pl[i].revents = 0;

		/* Negative fds are to be ignored per poll(2). */
		if (pl[i].fd < 0)
			continue;

		/*
		 * POLLHUP checking is implicit in the event filters.
		 * However, the checking must be even if no events are
		 * requested.
		 */
		forcehup = ((pl[i].events & ~POLLHUP) == 0);

		DPRINTFN(1, "poll set %d/%d fd %d events %02x serial %lu\n",
		    i+1, nfds, pl[i].fd, pl[i].events, p->p_kq_serial);

		/* NOTE(review): `nevt' is set but never read here. */
		nevt = 0;
		nkev = 0;
		kevp = kev;
		if (pl[i].events & (POLLIN | POLLRDNORM)) {
			EV_SET(kevp, pl[i].fd, EVFILT_READ,
			    EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}
		if (pl[i].events & (POLLOUT | POLLWRNORM)) {
			EV_SET(kevp, pl[i].fd, EVFILT_WRITE,
			    EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}
		if ((pl[i].events & (POLLPRI | POLLRDBAND)) || forcehup) {
			int evff = forcehup ? 0 : NOTE_OOB;

			EV_SET(kevp, pl[i].fd, EVFILT_EXCEPT,
			    EV_ADD|EV_ENABLE|__EV_POLL, evff, 0,
			    (void *)(p->p_kq_serial + i));
			nkev++;
			kevp++;
		}

		if (nkev == 0)
			continue;

		*nregistered += ppollregister_evts(p, kev, nkev, &pl[i], i);

		if (pl[i].revents != 0)
			(*ncollected)++;
	}

	DPRINTFN(1, "poll registered = %d, collected = %d\n", *nregistered,
	    *ncollected);
}

/*
 * Convert given kqueue event into corresponding poll(2) revents bit.
 */
int
ppollcollect(struct proc *p, struct kevent *kevp, struct pollfd *pl,
    u_int nfds)
{
	static struct timeval poll_errintvl = { 5, 0 };
	static struct timeval poll_lasterr;
	int already_seen;
	unsigned long i;

	/*  Extract poll array index */
	i = (unsigned long)kevp->udata - p->p_kq_serial;

	/* udata was seeded with p_kq_serial + index; validate both. */
	if (i >= nfds) {
		panic("%s: spurious kevp %p nfds %u udata 0x%lx serial 0x%lx",
		    __func__, kevp, nfds,
		    (unsigned long)kevp->udata, p->p_kq_serial);
	}
	if ((int)kevp->ident != pl[i].fd) {
		panic("%s: kevp %p %lu/%d mismatch fd %d!=%d serial 0x%lx",
		    __func__, kevp, i + 1, nfds, (int)kevp->ident, pl[i].fd,
		    p->p_kq_serial);
	}

	/*
	 * A given descriptor may already have generated an error
	 * against another filter during kqueue_register().
	 *
	 * Make sure to set the appropriate flags but do not
	 * increment `*retval' more than once.
	 */
	already_seen = (pl[i].revents != 0);

	/* POLLNVAL preempts other events. */
	if ((kevp->flags & EV_ERROR) && kevp->data == EBADF) {
		pl[i].revents = POLLNVAL;
		goto done;
	} else if (pl[i].revents & POLLNVAL) {
		goto done;
	}

	switch (kevp->filter) {
	case EVFILT_READ:
		if (kevp->flags & __EV_HUP)
			pl[i].revents |= POLLHUP;
		if (pl[i].events & (POLLIN | POLLRDNORM))
			pl[i].revents |= pl[i].events & (POLLIN | POLLRDNORM);
		break;
	case EVFILT_WRITE:
		/* POLLHUP and POLLOUT/POLLWRNORM are mutually exclusive */
		if (kevp->flags & __EV_HUP) {
			pl[i].revents |= POLLHUP;
		} else if (pl[i].events & (POLLOUT | POLLWRNORM)) {
			pl[i].revents |= pl[i].events & (POLLOUT | POLLWRNORM);
		}
		break;
	case EVFILT_EXCEPT:
		if (kevp->flags & __EV_HUP) {
			if (pl[i].events != 0 && pl[i].events != POLLOUT)
				DPRINTFN(0, "weird events %x\n", pl[i].events);
			pl[i].revents |= POLLHUP;
			break;
		}
		if (pl[i].events & (POLLPRI | POLLRDBAND))
			pl[i].revents |= pl[i].events & (POLLPRI | POLLRDBAND);
		break;
	default:
		KASSERT(0);
	}

done:
	DPRINTFN(1, "poll get %lu/%d fd %d revents %02x serial %lu filt %d\n",
	    i+1, nfds, pl[i].fd, pl[i].revents, (unsigned long)kevp->udata,
	    kevp->filter);

	/*
	 * Make noise about unclaimed events as they might indicate a bug
	 * and can result in spurious-looking wakeups of poll(2).
	 *
	 * Live-locking within the system call should not happen because
	 * the scan loop in doppoll() has an upper limit for the number
	 * of events to process.
	 */
	if (pl[i].revents == 0 && ratecheck(&poll_lasterr, &poll_errintvl)) {
		printf("%s[%d]: poll index %lu fd %d events 0x%x "
		    "filter %d/0x%x unclaimed\n",
		    p->p_p->ps_comm, p->p_tid, i, pl[i].fd,
		    pl[i].events, kevp->filter, kevp->flags);
	}

	if (!already_seen && (pl[i].revents != 0))
		return (1);

	return (0);
}

/*
 * utrace system call
 */
int
sys_utrace(struct proc *curp, void *v, register_t *retval)
{
#ifdef KTRACE
	struct sys_utrace_args /* {
		syscallarg(const char *) label;
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;

	return (ktruser(curp, SCARG(uap, label), SCARG(uap, addr),
	    SCARG(uap, len)));
#else
	/* Without KTRACE, utrace(2) is silently accepted as a no-op. */
	return (0);
#endif
}