From d8bf193ec09f26b570daa057ab9961744de1794d Mon Sep 17 00:00:00 2001 From: anton Date: Wed, 10 Jul 2019 16:43:21 +0000 Subject: Make read/write of the f_offset field belonging to struct file MP-safe; as part of the effort to unlock the kernel. Instead of relying on the vnode lock, introduce a dedicated lock per file. Exclusive write access is granted using the new foffset_enter and foffset_leave API. A convenience function foffset_get is also available for threads that only need to read the current offset. The lock acquisition order in vn_write has been changed to match the one in vn_read in order to avoid a potential deadlock. This change also gets rid of a documented race in vn_read(). Inspired by the FreeBSD implementation. With help and ok mpi@ visa@ --- sys/kern/kern_descrip.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++--- sys/kern/kern_sysctl.c | 4 +-- sys/kern/vfs_syscalls.c | 13 +++++--- sys/kern/vfs_vnops.c | 54 ++++++++++++++++++--------------- 4 files changed, 117 insertions(+), 35 deletions(-) (limited to 'sys/kern') diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index e2e4f1c668a..2cf66bb04aa 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -1,4 +1,4 @@ -/* $OpenBSD: kern_descrip.c,v 1.188 2019/07/03 14:32:02 visa Exp $ */ +/* $OpenBSD: kern_descrip.c,v 1.189 2019/07/10 16:43:19 anton Exp $ */ /* $NetBSD: kern_descrip.c,v 1.42 1996/03/30 22:24:38 christos Exp $ */ /* @@ -532,12 +532,14 @@ restart: ktrflock(p, &fl); #endif if (fl.l_whence == SEEK_CUR) { + off_t offset = foffset_get(fp); + if (fl.l_start == 0 && fl.l_len < 0) { /* lockf(3) compliance hack */ fl.l_len = -fl.l_len; - fl.l_start = fp->f_offset - fl.l_len; + fl.l_start = offset - fl.l_len; } else - fl.l_start += fp->f_offset; + fl.l_start += offset; } switch (fl.l_type) { @@ -602,12 +604,14 @@ restart: if (error) break; if (fl.l_whence == SEEK_CUR) { + off_t offset = foffset_get(fp); + if (fl.l_start == 0 && fl.l_len < 0) { /* lockf(3) compliance hack */ fl.l_len = -fl.l_len; - fl.l_start = fp->f_offset - fl.l_len; + fl.l_start = offset - fl.l_len; } else - fl.l_start += fp->f_offset; + fl.l_start += offset; } if (fl.l_type != F_RDLCK && fl.l_type != F_WRLCK && @@ -1276,6 +1280,73 @@ fdrop(struct file *fp, struct proc *p) return (error); } +/* + * Get the file offset without keeping the same offset locked upon return. + */ +off_t +foffset_get(struct file *fp) +{ + off_t offset; + + mtx_enter(&fp->f_mtx); + offset = fp->f_offset; + mtx_leave(&fp->f_mtx); + return (offset); +} + +/* + * Acquire an exclusive lock of the file offset. The calling thread must call + * foffset_leave() once done. + */ +off_t +foffset_enter(struct file *fp) +{ + off_t offset; + + mtx_enter(&fp->f_mtx); + + while (fp->f_olock & FOL_LOCKED) { + KASSERT((fp->f_olock & FOL_NWAIT) < FOL_NWAIT); + fp->f_olock++; + msleep(&fp->f_olock, &fp->f_mtx, PLOCK, "foffset", 0); + KASSERT((fp->f_olock & FOL_NWAIT) > 0); + fp->f_olock--; + } + fp->f_olock |= FOL_LOCKED; + + offset = fp->f_offset; + + mtx_leave(&fp->f_mtx); + + return (offset); +} + +/* + * Write a new file offset and release the lock. The calling thread must already + * have acquired the lock using foffset_enter(). + * If FO_NOUPDATE is present in flags, only the lock is released and the offset + * remains unmodified. + */ +void +foffset_leave(struct file *fp, off_t offset, int flags) +{ + unsigned int nwait; + + mtx_enter(&fp->f_mtx); + + KASSERT(fp->f_olock & FOL_LOCKED); + + if ((flags & FO_NOUPDATE) == 0) + fp->f_offset = offset; + nwait = fp->f_olock & FOL_NWAIT; + fp->f_olock &= ~FOL_LOCKED; + + mtx_leave(&fp->f_mtx); + + if (nwait > 0) + wakeup_one(&fp->f_olock); +} + /* * Apply an advisory lock on a file descriptor. * diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c index 07e85ba692e..06eb7803191 100644 --- a/sys/kern/kern_sysctl.c +++ b/sys/kern/kern_sysctl.c @@ -1,4 +1,4 @@ -/* $OpenBSD: kern_sysctl.c,v 1.360 2019/06/16 00:56:53 bluhm Exp $ */ +/* $OpenBSD: kern_sysctl.c,v 1.361 2019/07/10 16:43:19 anton Exp $ */ /* $NetBSD: kern_sysctl.c,v 1.17 1996/05/20 17:49:05 mrg Exp $ */ /*- @@ -1100,8 +1100,8 @@ fill_file(struct kinfo_file *kf, struct file *fp, struct filedesc *fdp, kf->f_usecount = 0; if (suser(p) == 0 || p->p_ucred->cr_uid == fp->f_cred->cr_uid) { - kf->f_offset = fp->f_offset; mtx_enter(&fp->f_mtx); + kf->f_offset = fp->f_offset; kf->f_rxfer = fp->f_rxfer; kf->f_rwfer = fp->f_wxfer; kf->f_seek = fp->f_seek; diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 37c2332537f..c334dc98a69 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vfs_syscalls.c,v 1.319 2019/06/19 16:55:51 deraadt Exp $ */ +/* $OpenBSD: vfs_syscalls.c,v 1.320 2019/07/10 16:43:19 anton Exp $ */ /* $NetBSD: vfs_syscalls.c,v 1.71 1996/04/23 10:29:02 mycroft Exp $ */ /* @@ -2999,6 +2999,7 @@ sys_getdents(struct proc *p, void *v, register_t *retval) struct uio auio; struct iovec aiov; size_t buflen; + off_t offset; int error, eofflag; buflen = SCARG(uap, buflen); @@ -3011,12 +3012,16 @@ sys_getdents(struct proc *p, void *v, register_t *retval) error = EBADF; goto bad; } - if (fp->f_offset < 0) { + + offset = foffset_enter(fp); + if (offset < 0) { + foffset_leave(fp, 0, FO_NOUPDATE); error = EINVAL; goto bad; } vp = fp->f_data; if (vp->v_type != VDIR) { + foffset_leave(fp, 0, FO_NOUPDATE); error = EINVAL; goto bad; } @@ -3029,10 +3034,10 @@ sys_getdents(struct proc *p, void *v, register_t *retval) auio.uio_procp = p; auio.uio_resid = buflen; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - auio.uio_offset = fp->f_offset; + auio.uio_offset = offset; error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag); - fp->f_offset = auio.uio_offset; VOP_UNLOCK(vp); + foffset_leave(fp, auio.uio_offset, 0); if (error) goto bad; *retval = buflen - auio.uio_resid; diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 520126602c1..43b1dbe1c85 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vfs_vnops.c,v 1.99 2019/06/22 06:48:25 semarie Exp $ */ +/* $OpenBSD: vfs_vnops.c,v 1.100 2019/07/10 16:43:19 anton Exp $ */ /* $NetBSD: vfs_vnops.c,v 1.20 1996/02/04 02:18:41 christos Exp $ */ /* @@ -342,38 +342,35 @@ vn_read(struct file *fp, struct uio *uio, int fflags) size_t count = uio->uio_resid; off_t offset; int error; + int foflags = 0; KERNEL_LOCK(); - /* - * Check below can race. We can block on the vnode lock - * and resume with a different `fp->f_offset' value. - */ if ((fflags & FO_POSITION) == 0) - offset = fp->f_offset; + uio->uio_offset = offset = foffset_enter(fp); else offset = uio->uio_offset; /* no wrap around of offsets except on character devices */ if (vp->v_type != VCHR && count > LLONG_MAX - offset) { + foflags = FO_NOUPDATE; error = EINVAL; goto done; } if (vp->v_type == VDIR) { + foflags = FO_NOUPDATE; error = EISDIR; goto done; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - if ((fflags & FO_POSITION) == 0) - uio->uio_offset = fp->f_offset; error = VOP_READ(vp, uio, (fp->f_flag & FNONBLOCK) ? IO_NDELAY : 0, cred); - if ((fflags & FO_POSITION) == 0) - fp->f_offset += count - uio->uio_resid; VOP_UNLOCK(vp); done: + if ((fflags & FO_POSITION) == 0) + foffset_leave(fp, offset + (count - uio->uio_resid), foflags); KERNEL_UNLOCK(); return (error); } @@ -386,11 +383,15 @@ vn_write(struct file *fp, struct uio *uio, int fflags) { struct vnode *vp = fp->f_data; struct ucred *cred = fp->f_cred; + off_t offset; int error, ioflag = IO_UNIT; size_t count; KERNEL_LOCK(); + if ((fflags & FO_POSITION) == 0) + uio->uio_offset = offset = foffset_enter(fp); + /* note: pwrite/pwritev are unaffected by O_APPEND */ if (vp->v_type == VREG && (fp->f_flag & O_APPEND) && (fflags & FO_POSITION) == 0) @@ -401,17 +402,15 @@ vn_write(struct file *fp, struct uio *uio, int fflags) (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) ioflag |= IO_SYNC; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - if ((fflags & FO_POSITION) == 0) - uio->uio_offset = fp->f_offset; count = uio->uio_resid; error = VOP_WRITE(vp, uio, ioflag, cred); + VOP_UNLOCK(vp); if ((fflags & FO_POSITION) == 0) { if (ioflag & IO_APPEND) - fp->f_offset = uio->uio_offset; + foffset_leave(fp, uio->uio_offset, 0); else - fp->f_offset += count - uio->uio_resid; + foffset_leave(fp, offset + (count - uio->uio_resid), 0); } - VOP_UNLOCK(vp); KERNEL_UNLOCK(); return (error); @@ -509,7 +508,7 @@ vn_ioctl(struct file *fp, u_long com, caddr_t data, struct proc *p) error = VOP_GETATTR(vp, &vattr, p->p_ucred, p); if (error) return (error); - *(int *)data = vattr.va_size - fp->f_offset; + *(int *)data = vattr.va_size - foffset_get(fp); return (0); } if (com == FIONBIO || com == FIOASYNC) /* XXX */ @@ -601,7 +600,7 @@ vn_seek(struct file *fp, off_t *offset, int whence, struct proc *p) struct ucred *cred = p->p_ucred; struct vnode *vp = fp->f_data; struct vattr vattr; - off_t newoff; + off_t curoff, newoff; int error, special; if (vp->v_type == VFIFO) @@ -611,28 +610,35 @@ vn_seek(struct file *fp, off_t *offset, int whence, struct proc *p) else special = 0; + curoff = foffset_enter(fp); switch (whence) { case SEEK_CUR: - newoff = fp->f_offset + *offset; + newoff = curoff + *offset; break; case SEEK_END: error = VOP_GETATTR(vp, &vattr, cred, p); if (error) - return (error); + goto bad; newoff = *offset + (off_t)vattr.va_size; break; case SEEK_SET: newoff = *offset; break; default: - return (EINVAL); + error = EINVAL; + goto bad; } - if (!special) { - if (newoff < 0) - return(EINVAL); + if (!special && newoff < 0) { + error = EINVAL; + goto bad; } - fp->f_offset = *offset = newoff; + foffset_leave(fp, newoff, 0); + *offset = newoff; return (0); + +bad: + foffset_leave(fp, 0, FO_NOUPDATE); + return (error); } /* -- cgit v1.2.3