From d8bf193ec09f26b570daa057ab9961744de1794d Mon Sep 17 00:00:00 2001
From: anton <anton@cvs.openbsd.org>
Date: Wed, 10 Jul 2019 16:43:21 +0000
Subject: Make read/write of the f_offset field belonging to struct file
 MP-safe; as part of the effort to unlock the kernel. Instead of relying on
 the vnode lock, introduce a dedicated lock per file. Exclusive write access
 is granted using the new foffset_enter and foffset_leave API. A convenience
 function foffset_get is also available for threads that only need to read the
 current offset.

The lock acquisition order in vn_write has been changed to match the one
in vn_read in order to avoid a potential deadlock. This change also gets
rid of a documented race in vn_read().

Inspired by the FreeBSD implementation.

With help and ok mpi@ visa@
---
 sys/kern/kern_descrip.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++---
 sys/kern/kern_sysctl.c  |  4 +--
 sys/kern/vfs_syscalls.c | 13 +++++---
 sys/kern/vfs_vnops.c    | 54 ++++++++++++++++++---------------
 4 files changed, 117 insertions(+), 35 deletions(-)

(limited to 'sys/kern')

diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index e2e4f1c668a..2cf66bb04aa 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: kern_descrip.c,v 1.188 2019/07/03 14:32:02 visa Exp $	*/
+/*	$OpenBSD: kern_descrip.c,v 1.189 2019/07/10 16:43:19 anton Exp $	*/
 /*	$NetBSD: kern_descrip.c,v 1.42 1996/03/30 22:24:38 christos Exp $	*/
 
 /*
@@ -532,12 +532,14 @@ restart:
 			ktrflock(p, &fl);
 #endif
 		if (fl.l_whence == SEEK_CUR) {
+			off_t offset = foffset_get(fp);
+
 			if (fl.l_start == 0 && fl.l_len < 0) {
 				/* lockf(3) compliance hack */
 				fl.l_len = -fl.l_len;
-				fl.l_start = fp->f_offset - fl.l_len;
+				fl.l_start = offset - fl.l_len;
 			} else
-				fl.l_start += fp->f_offset;
+				fl.l_start += offset;
 		}
 		switch (fl.l_type) {
 
@@ -602,12 +604,14 @@ restart:
 		if (error)
 			break;
 		if (fl.l_whence == SEEK_CUR) {
+			off_t offset = foffset_get(fp);
+
 			if (fl.l_start == 0 && fl.l_len < 0) {
 				/* lockf(3) compliance hack */
 				fl.l_len = -fl.l_len;
-				fl.l_start = fp->f_offset - fl.l_len;
+				fl.l_start = offset - fl.l_len;
 			} else
-				fl.l_start += fp->f_offset;
+				fl.l_start += offset;
 		}
 		if (fl.l_type != F_RDLCK &&
 		    fl.l_type != F_WRLCK &&
@@ -1276,6 +1280,73 @@ fdrop(struct file *fp, struct proc *p)
 	return (error);
 }
 
+/*
+ * Get the file offset without keeping the same offset locked upon return.
+ */
+off_t
+foffset_get(struct file *fp)
+{
+	off_t offset;
+
+	mtx_enter(&fp->f_mtx);
+	offset = fp->f_offset;
+	mtx_leave(&fp->f_mtx);
+	return (offset);
+}
+
+/*
+ * Acquire an exclusive lock of the file offset. The calling thread must call
+ * foffset_leave() once done.
+ */
+off_t
+foffset_enter(struct file *fp)
+{
+	off_t offset;
+
+	mtx_enter(&fp->f_mtx);
+
+	while (fp->f_olock & FOL_LOCKED) {
+		KASSERT((fp->f_olock & FOL_NWAIT) < FOL_NWAIT);
+		fp->f_olock++;
+		msleep(&fp->f_olock, &fp->f_mtx, PLOCK, "foffset", 0);
+		KASSERT((fp->f_olock & FOL_NWAIT) > 0);
+		fp->f_olock--;
+	}
+	fp->f_olock |= FOL_LOCKED;
+
+	offset = fp->f_offset;
+
+	mtx_leave(&fp->f_mtx);
+
+	return (offset);
+}
+
+/*
+ * Write a new file offset and release the lock. The calling thread must already
+ * have acquired the lock using foffset_enter().
+ * If FO_NOUPDATE is present in flags, only the lock is released and the offset
+ * remains unmodified.
+ */
+void
+foffset_leave(struct file *fp, off_t offset, int flags)
+{
+	unsigned int nwait;
+
+	mtx_enter(&fp->f_mtx);
+
+	KASSERT(fp->f_olock & FOL_LOCKED);
+
+	if ((flags & FO_NOUPDATE) == 0)
+		fp->f_offset = offset;
+	nwait = fp->f_olock & FOL_NWAIT;
+	fp->f_olock &= ~FOL_LOCKED;
+
+	mtx_leave(&fp->f_mtx);
+
+	if (nwait > 0)
+		wakeup_one(&fp->f_olock);
+}
+
 /*
  * Apply an advisory lock on a file descriptor.
  *
diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c
index 07e85ba692e..06eb7803191 100644
--- a/sys/kern/kern_sysctl.c
+++ b/sys/kern/kern_sysctl.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: kern_sysctl.c,v 1.360 2019/06/16 00:56:53 bluhm Exp $	*/
+/*	$OpenBSD: kern_sysctl.c,v 1.361 2019/07/10 16:43:19 anton Exp $	*/
 /*	$NetBSD: kern_sysctl.c,v 1.17 1996/05/20 17:49:05 mrg Exp $	*/
 
 /*-
@@ -1100,8 +1100,8 @@ fill_file(struct kinfo_file *kf, struct file *fp, struct filedesc *fdp,
 		kf->f_usecount = 0;
 
 		if (suser(p) == 0 || p->p_ucred->cr_uid == fp->f_cred->cr_uid) {
-			kf->f_offset = fp->f_offset;
 			mtx_enter(&fp->f_mtx);
+			kf->f_offset = fp->f_offset;
 			kf->f_rxfer = fp->f_rxfer;
 			kf->f_rwfer = fp->f_wxfer;
 			kf->f_seek = fp->f_seek;
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 37c2332537f..c334dc98a69 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: vfs_syscalls.c,v 1.319 2019/06/19 16:55:51 deraadt Exp $	*/
+/*	$OpenBSD: vfs_syscalls.c,v 1.320 2019/07/10 16:43:19 anton Exp $	*/
 /*	$NetBSD: vfs_syscalls.c,v 1.71 1996/04/23 10:29:02 mycroft Exp $	*/
 
 /*
@@ -2999,6 +2999,7 @@ sys_getdents(struct proc *p, void *v, register_t *retval)
 	struct uio auio;
 	struct iovec aiov;
 	size_t buflen;
+	off_t offset;
 	int error, eofflag;
 
 	buflen = SCARG(uap, buflen);
@@ -3011,12 +3012,16 @@ sys_getdents(struct proc *p, void *v, register_t *retval)
 		error = EBADF;
 		goto bad;
 	}
-	if (fp->f_offset < 0) {
+
+	offset = foffset_enter(fp);
+	if (offset < 0) {
+		foffset_leave(fp, 0, FO_NOUPDATE);
 		error = EINVAL;
 		goto bad;
 	}
 	vp = fp->f_data;
 	if (vp->v_type != VDIR) {
+		foffset_leave(fp, 0, FO_NOUPDATE);
 		error = EINVAL;
 		goto bad;
 	}
@@ -3029,10 +3034,10 @@ sys_getdents(struct proc *p, void *v, register_t *retval)
 	auio.uio_procp = p;
 	auio.uio_resid = buflen;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-	auio.uio_offset = fp->f_offset;
+	auio.uio_offset = offset;
 	error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag);
-	fp->f_offset = auio.uio_offset;
 	VOP_UNLOCK(vp);
+	foffset_leave(fp, auio.uio_offset, 0);
 	if (error)
 		goto bad;
 	*retval = buflen - auio.uio_resid;
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 520126602c1..43b1dbe1c85 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: vfs_vnops.c,v 1.99 2019/06/22 06:48:25 semarie Exp $	*/
+/*	$OpenBSD: vfs_vnops.c,v 1.100 2019/07/10 16:43:19 anton Exp $	*/
 /*	$NetBSD: vfs_vnops.c,v 1.20 1996/02/04 02:18:41 christos Exp $	*/
 
 /*
@@ -342,38 +342,35 @@ vn_read(struct file *fp, struct uio *uio, int fflags)
 	size_t count = uio->uio_resid;
 	off_t offset;
 	int error;
+	int foflags = 0;
 
 	KERNEL_LOCK();
 
-	/*
-	 * Check below can race.  We can block on the vnode lock
-	 * and resume with a different `fp->f_offset' value.
-	 */
 	if ((fflags & FO_POSITION) == 0)
-		offset = fp->f_offset;
+		uio->uio_offset = offset = foffset_enter(fp);
 	else
 		offset = uio->uio_offset;
 
 	/* no wrap around of offsets except on character devices */
 	if (vp->v_type != VCHR && count > LLONG_MAX - offset) {
+		foflags = FO_NOUPDATE;
 		error = EINVAL;
 		goto done;
 	}
 
 	if (vp->v_type == VDIR) {
+		foflags = FO_NOUPDATE;
 		error = EISDIR;
 		goto done;
 	}
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-	if ((fflags & FO_POSITION) == 0)
-		uio->uio_offset = fp->f_offset;
 	error = VOP_READ(vp, uio, (fp->f_flag & FNONBLOCK) ? IO_NDELAY : 0,
 	    cred);
-	if ((fflags & FO_POSITION) == 0)
-		fp->f_offset += count - uio->uio_resid;
 	VOP_UNLOCK(vp);
 done:
+	if ((fflags & FO_POSITION) == 0)
+		foffset_leave(fp, offset + (count - uio->uio_resid), foflags);
 	KERNEL_UNLOCK();
 	return (error);
 }
@@ -386,11 +383,15 @@ vn_write(struct file *fp, struct uio *uio, int fflags)
 {
 	struct vnode *vp = fp->f_data;
 	struct ucred *cred = fp->f_cred;
+	off_t offset;
 	int error, ioflag = IO_UNIT;
 	size_t count;
 
 	KERNEL_LOCK();
 
+	if ((fflags & FO_POSITION) == 0)
+		uio->uio_offset = offset = foffset_enter(fp);
+
 	/* note: pwrite/pwritev are unaffected by O_APPEND */
 	if (vp->v_type == VREG && (fp->f_flag & O_APPEND) &&
 	    (fflags & FO_POSITION) == 0)
@@ -401,17 +402,15 @@ vn_write(struct file *fp, struct uio *uio, int fflags)
 	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
 		ioflag |= IO_SYNC;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-	if ((fflags & FO_POSITION) == 0)
-		uio->uio_offset = fp->f_offset;
 	count = uio->uio_resid;
 	error = VOP_WRITE(vp, uio, ioflag, cred);
+	VOP_UNLOCK(vp);
 	if ((fflags & FO_POSITION) == 0) {
 		if (ioflag & IO_APPEND)
-			fp->f_offset = uio->uio_offset;
+			foffset_leave(fp, uio->uio_offset, 0);
 		else
-			fp->f_offset += count - uio->uio_resid;
+			foffset_leave(fp, offset + (count - uio->uio_resid), 0);
 	}
-	VOP_UNLOCK(vp);
 
 	KERNEL_UNLOCK();
 	return (error);
@@ -509,7 +508,7 @@ vn_ioctl(struct file *fp, u_long com, caddr_t data, struct proc *p)
 			error = VOP_GETATTR(vp, &vattr, p->p_ucred, p);
 			if (error)
 				return (error);
-			*(int *)data = vattr.va_size - fp->f_offset;
+			*(int *)data = vattr.va_size - foffset_get(fp);
 			return (0);
 		}
 		if (com == FIONBIO || com == FIOASYNC)  /* XXX */
@@ -601,7 +600,7 @@ vn_seek(struct file *fp, off_t *offset, int whence, struct proc *p)
 	struct ucred *cred = p->p_ucred;
 	struct vnode *vp = fp->f_data;
 	struct vattr vattr;
-	off_t newoff;
+	off_t curoff, newoff;
 	int error, special;
 
 	if (vp->v_type == VFIFO)
@@ -611,28 +610,35 @@ vn_seek(struct file *fp, off_t *offset, int whence, struct proc *p)
 	else
 		special = 0;
 
+	curoff = foffset_enter(fp);
 	switch (whence) {
 	case SEEK_CUR:
-		newoff = fp->f_offset + *offset;
+		newoff = curoff + *offset;
 		break;
 	case SEEK_END:
 		error = VOP_GETATTR(vp, &vattr, cred, p);
 		if (error)
-			return (error);
+			goto bad;
 		newoff = *offset + (off_t)vattr.va_size;
 		break;
 	case SEEK_SET:
 		newoff = *offset;
 		break;
 	default:
-		return (EINVAL);
+		error = EINVAL;
+		goto bad;
 	}
-	if (!special) {
-		if (newoff < 0)
-			return(EINVAL);
+	if (!special && newoff < 0) {
+		error = EINVAL;
+		goto bad;
 	}
-	fp->f_offset = *offset = newoff;
+	foffset_leave(fp, newoff, 0);
+	*offset = newoff;
 	return (0);
+
+bad:
+	foffset_leave(fp, 0, FO_NOUPDATE);
+	return (error);
 }
 
 /*
-- 
cgit v1.2.3