summaryrefslogtreecommitdiff
path: root/sys/kern
diff options
context:
space:
mode:
authoranton <anton@cvs.openbsd.org>2019-07-10 16:43:21 +0000
committeranton <anton@cvs.openbsd.org>2019-07-10 16:43:21 +0000
commitd8bf193ec09f26b570daa057ab9961744de1794d (patch)
treefb42455d3d5d0bb3236d5a9551cade60758d5ad7 /sys/kern
parentcfb2b7e76d2d97505ffb3e2ac6a43561e980f07a (diff)
Make read/write of the f_offset field belonging to struct file MP-safe;
as part of the effort to unlock the kernel. Instead of relying on the vnode lock, introduce a dedicated lock per file. Exclusive write access is granted using the new foffset_enter and foffset_leave API. A convenience function foffset_get is also available for threads that only need to read the current offset. The lock acquisition order in vn_write has been changed to match the one in vn_read in order to avoid a potential deadlock. This change also gets rid of a documented race in vn_read(). Inspired by the FreeBSD implementation. With help and ok mpi@ visa@
Diffstat (limited to 'sys/kern')
-rw-r--r--sys/kern/kern_descrip.c81
-rw-r--r--sys/kern/kern_sysctl.c4
-rw-r--r--sys/kern/vfs_syscalls.c13
-rw-r--r--sys/kern/vfs_vnops.c54
4 files changed, 117 insertions, 35 deletions
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index e2e4f1c668a..2cf66bb04aa 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: kern_descrip.c,v 1.188 2019/07/03 14:32:02 visa Exp $ */
+/* $OpenBSD: kern_descrip.c,v 1.189 2019/07/10 16:43:19 anton Exp $ */
/* $NetBSD: kern_descrip.c,v 1.42 1996/03/30 22:24:38 christos Exp $ */
/*
@@ -532,12 +532,14 @@ restart:
ktrflock(p, &fl);
#endif
if (fl.l_whence == SEEK_CUR) {
+ off_t offset = foffset_get(fp);
+
if (fl.l_start == 0 && fl.l_len < 0) {
/* lockf(3) compliance hack */
fl.l_len = -fl.l_len;
- fl.l_start = fp->f_offset - fl.l_len;
+ fl.l_start = offset - fl.l_len;
} else
- fl.l_start += fp->f_offset;
+ fl.l_start += offset;
}
switch (fl.l_type) {
@@ -602,12 +604,14 @@ restart:
if (error)
break;
if (fl.l_whence == SEEK_CUR) {
+ off_t offset = foffset_get(fp);
+
if (fl.l_start == 0 && fl.l_len < 0) {
/* lockf(3) compliance hack */
fl.l_len = -fl.l_len;
- fl.l_start = fp->f_offset - fl.l_len;
+ fl.l_start = offset - fl.l_len;
} else
- fl.l_start += fp->f_offset;
+ fl.l_start += offset;
}
if (fl.l_type != F_RDLCK &&
fl.l_type != F_WRLCK &&
@@ -1277,6 +1281,73 @@ fdrop(struct file *fp, struct proc *p)
}
/*
+ * Get the file offset without keeping the same offset locked upon return.
+ */
+off_t
+foffset_get(struct file *fp)
+{
+ off_t offset;
+
+ mtx_enter(&fp->f_mtx);
+ offset = fp->f_offset;
+ mtx_leave(&fp->f_mtx);
+ return (offset);
+}
+
+/*
+ * Acquire an exclusive lock of the file offset. The calling thread must call
+ * foffset_leave() once done.
+ */
+off_t
+foffset_enter(struct file *fp)
+{
+ off_t offset;
+
+ mtx_enter(&fp->f_mtx);
+
+ while (fp->f_olock & FOL_LOCKED) {
+ KASSERT((fp->f_olock & FOL_NWAIT) < FOL_NWAIT);
+ fp->f_olock++;
+ msleep(&fp->f_olock, &fp->f_mtx, PLOCK, "foffset", 0);
+ KASSERT((fp->f_olock & FOL_NWAIT) > 0);
+ fp->f_olock--;
+ }
+ fp->f_olock |= FOL_LOCKED;
+
+ offset = fp->f_offset;
+
+ mtx_leave(&fp->f_mtx);
+
+ return (offset);
+}
+
+/*
+ * Write a new file offset and release the lock. The calling thread must already
+ * have acquired the lock using foffset_enter().
+ * If FO_NOUPDATE is present in flags, only the lock is released and the offset
+ * remains unmodified.
+ */
+void
+foffset_leave(struct file *fp, off_t offset, int flags)
+{
+ unsigned int nwait;
+
+ mtx_enter(&fp->f_mtx);
+
+ KASSERT(fp->f_olock & FOL_LOCKED);
+
+ if ((flags & FO_NOUPDATE) == 0)
+ fp->f_offset = offset;
+ nwait = fp->f_olock & FOL_NWAIT;
+ fp->f_olock &= ~FOL_LOCKED;
+
+ mtx_leave(&fp->f_mtx);
+
+ if (nwait > 0)
+ wakeup_one(&fp->f_olock);
+}
+
+/*
* Apply an advisory lock on a file descriptor.
*
* Just attempt to get a record lock of the requested type on
diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c
index 07e85ba692e..06eb7803191 100644
--- a/sys/kern/kern_sysctl.c
+++ b/sys/kern/kern_sysctl.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: kern_sysctl.c,v 1.360 2019/06/16 00:56:53 bluhm Exp $ */
+/* $OpenBSD: kern_sysctl.c,v 1.361 2019/07/10 16:43:19 anton Exp $ */
/* $NetBSD: kern_sysctl.c,v 1.17 1996/05/20 17:49:05 mrg Exp $ */
/*-
@@ -1100,8 +1100,8 @@ fill_file(struct kinfo_file *kf, struct file *fp, struct filedesc *fdp,
kf->f_usecount = 0;
if (suser(p) == 0 || p->p_ucred->cr_uid == fp->f_cred->cr_uid) {
- kf->f_offset = fp->f_offset;
mtx_enter(&fp->f_mtx);
+ kf->f_offset = fp->f_offset;
kf->f_rxfer = fp->f_rxfer;
kf->f_rwfer = fp->f_wxfer;
kf->f_seek = fp->f_seek;
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 37c2332537f..c334dc98a69 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: vfs_syscalls.c,v 1.319 2019/06/19 16:55:51 deraadt Exp $ */
+/* $OpenBSD: vfs_syscalls.c,v 1.320 2019/07/10 16:43:19 anton Exp $ */
/* $NetBSD: vfs_syscalls.c,v 1.71 1996/04/23 10:29:02 mycroft Exp $ */
/*
@@ -2999,6 +2999,7 @@ sys_getdents(struct proc *p, void *v, register_t *retval)
struct uio auio;
struct iovec aiov;
size_t buflen;
+ off_t offset;
int error, eofflag;
buflen = SCARG(uap, buflen);
@@ -3011,12 +3012,16 @@ sys_getdents(struct proc *p, void *v, register_t *retval)
error = EBADF;
goto bad;
}
- if (fp->f_offset < 0) {
+
+ offset = foffset_enter(fp);
+ if (offset < 0) {
+ foffset_leave(fp, 0, FO_NOUPDATE);
error = EINVAL;
goto bad;
}
vp = fp->f_data;
if (vp->v_type != VDIR) {
+ foffset_leave(fp, 0, FO_NOUPDATE);
error = EINVAL;
goto bad;
}
@@ -3029,10 +3034,10 @@ sys_getdents(struct proc *p, void *v, register_t *retval)
auio.uio_procp = p;
auio.uio_resid = buflen;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- auio.uio_offset = fp->f_offset;
+ auio.uio_offset = offset;
error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag);
- fp->f_offset = auio.uio_offset;
VOP_UNLOCK(vp);
+ foffset_leave(fp, auio.uio_offset, 0);
if (error)
goto bad;
*retval = buflen - auio.uio_resid;
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 520126602c1..43b1dbe1c85 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: vfs_vnops.c,v 1.99 2019/06/22 06:48:25 semarie Exp $ */
+/* $OpenBSD: vfs_vnops.c,v 1.100 2019/07/10 16:43:19 anton Exp $ */
/* $NetBSD: vfs_vnops.c,v 1.20 1996/02/04 02:18:41 christos Exp $ */
/*
@@ -342,38 +342,35 @@ vn_read(struct file *fp, struct uio *uio, int fflags)
size_t count = uio->uio_resid;
off_t offset;
int error;
+ int foflags = 0;
KERNEL_LOCK();
- /*
- * Check below can race. We can block on the vnode lock
- * and resume with a different `fp->f_offset' value.
- */
if ((fflags & FO_POSITION) == 0)
- offset = fp->f_offset;
+ uio->uio_offset = offset = foffset_enter(fp);
else
offset = uio->uio_offset;
/* no wrap around of offsets except on character devices */
if (vp->v_type != VCHR && count > LLONG_MAX - offset) {
+ foflags = FO_NOUPDATE;
error = EINVAL;
goto done;
}
if (vp->v_type == VDIR) {
+ foflags = FO_NOUPDATE;
error = EISDIR;
goto done;
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- if ((fflags & FO_POSITION) == 0)
- uio->uio_offset = fp->f_offset;
error = VOP_READ(vp, uio, (fp->f_flag & FNONBLOCK) ? IO_NDELAY : 0,
cred);
- if ((fflags & FO_POSITION) == 0)
- fp->f_offset += count - uio->uio_resid;
VOP_UNLOCK(vp);
done:
+ if ((fflags & FO_POSITION) == 0)
+ foffset_leave(fp, offset + (count - uio->uio_resid), foflags);
KERNEL_UNLOCK();
return (error);
}
@@ -386,11 +383,15 @@ vn_write(struct file *fp, struct uio *uio, int fflags)
{
struct vnode *vp = fp->f_data;
struct ucred *cred = fp->f_cred;
+ off_t offset;
int error, ioflag = IO_UNIT;
size_t count;
KERNEL_LOCK();
+ if ((fflags & FO_POSITION) == 0)
+ uio->uio_offset = offset = foffset_enter(fp);
+
/* note: pwrite/pwritev are unaffected by O_APPEND */
if (vp->v_type == VREG && (fp->f_flag & O_APPEND) &&
(fflags & FO_POSITION) == 0)
@@ -401,17 +402,15 @@ vn_write(struct file *fp, struct uio *uio, int fflags)
(vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
ioflag |= IO_SYNC;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- if ((fflags & FO_POSITION) == 0)
- uio->uio_offset = fp->f_offset;
count = uio->uio_resid;
error = VOP_WRITE(vp, uio, ioflag, cred);
+ VOP_UNLOCK(vp);
if ((fflags & FO_POSITION) == 0) {
if (ioflag & IO_APPEND)
- fp->f_offset = uio->uio_offset;
+ foffset_leave(fp, uio->uio_offset, 0);
else
- fp->f_offset += count - uio->uio_resid;
+ foffset_leave(fp, offset + (count - uio->uio_resid), 0);
}
- VOP_UNLOCK(vp);
KERNEL_UNLOCK();
return (error);
@@ -509,7 +508,7 @@ vn_ioctl(struct file *fp, u_long com, caddr_t data, struct proc *p)
error = VOP_GETATTR(vp, &vattr, p->p_ucred, p);
if (error)
return (error);
- *(int *)data = vattr.va_size - fp->f_offset;
+ *(int *)data = vattr.va_size - foffset_get(fp);
return (0);
}
if (com == FIONBIO || com == FIOASYNC) /* XXX */
@@ -601,7 +600,7 @@ vn_seek(struct file *fp, off_t *offset, int whence, struct proc *p)
struct ucred *cred = p->p_ucred;
struct vnode *vp = fp->f_data;
struct vattr vattr;
- off_t newoff;
+ off_t curoff, newoff;
int error, special;
if (vp->v_type == VFIFO)
@@ -611,28 +610,35 @@ vn_seek(struct file *fp, off_t *offset, int whence, struct proc *p)
else
special = 0;
+ curoff = foffset_enter(fp);
switch (whence) {
case SEEK_CUR:
- newoff = fp->f_offset + *offset;
+ newoff = curoff + *offset;
break;
case SEEK_END:
error = VOP_GETATTR(vp, &vattr, cred, p);
if (error)
- return (error);
+ goto bad;
newoff = *offset + (off_t)vattr.va_size;
break;
case SEEK_SET:
newoff = *offset;
break;
default:
- return (EINVAL);
+ error = EINVAL;
+ goto bad;
}
- if (!special) {
- if (newoff < 0)
- return(EINVAL);
+ if (!special && newoff < 0) {
+ error = EINVAL;
+ goto bad;
}
- fp->f_offset = *offset = newoff;
+ foffset_leave(fp, newoff, 0);
+ *offset = newoff;
return (0);
+
+bad:
+ foffset_leave(fp, 0, FO_NOUPDATE);
+ return (error);
}
/*