summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoranton <anton@cvs.openbsd.org>2019-07-10 16:43:21 +0000
committeranton <anton@cvs.openbsd.org>2019-07-10 16:43:21 +0000
commitd8bf193ec09f26b570daa057ab9961744de1794d (patch)
treefb42455d3d5d0bb3236d5a9551cade60758d5ad7
parentcfb2b7e76d2d97505ffb3e2ac6a43561e980f07a (diff)
Make read/write of the f_offset field belonging to struct file MP-safe;
as part of the effort to unlock the kernel. Instead of relying on the vnode lock, introduce a dedicated lock per file. Exclusive write access is granted using the new foffset_enter and foffset_leave API. A convenience function foffset_get is also available for threads that only need to read the current offset. The lock acquisition order in vn_write has been changed to match the one in vn_read in order to avoid a potential deadlock. This change also gets rid of a documented race in vn_read(). Inspired by the FreeBSD implementation. With help and ok mpi@ visa@
-rw-r--r--sys/dev/pci/drm/drm_linux.c6
-rw-r--r--sys/isofs/cd9660/cd9660_vnops.c4
-rw-r--r--sys/kern/kern_descrip.c81
-rw-r--r--sys/kern/kern_sysctl.c4
-rw-r--r--sys/kern/vfs_syscalls.c13
-rw-r--r--sys/kern/vfs_vnops.c54
-rw-r--r--sys/miscfs/fuse/fuse_vnops.c4
-rw-r--r--sys/msdosfs/msdosfs_vnops.c4
-rw-r--r--sys/nfs/nfs_kq.c4
-rw-r--r--sys/sys/file.h14
-rw-r--r--sys/tmpfs/tmpfs_vnops.c4
-rw-r--r--sys/ufs/ufs/ufs_vnops.c6
12 files changed, 145 insertions, 53 deletions
diff --git a/sys/dev/pci/drm/drm_linux.c b/sys/dev/pci/drm/drm_linux.c
index e338d37390c..90dc896ad3e 100644
--- a/sys/dev/pci/drm/drm_linux.c
+++ b/sys/dev/pci/drm/drm_linux.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: drm_linux.c,v 1.42 2019/07/10 07:56:30 kettenis Exp $ */
+/* $OpenBSD: drm_linux.c,v 1.43 2019/07/10 16:43:19 anton Exp $ */
/*
* Copyright (c) 2013 Jonathan Gray <jsg@openbsd.org>
* Copyright (c) 2015, 2016 Mark Kettenis <kettenis@openbsd.org>
@@ -1346,7 +1346,9 @@ dmabuf_seek(struct file *fp, off_t *offset, int whence, struct proc *p)
default:
return (EINVAL);
}
- fp->f_offset = *offset = newoff;
+ foffset_enter(fp);
+ foffset_leave(fp, newoff, 0);
+ *offset = newoff;
return (0);
}
diff --git a/sys/isofs/cd9660/cd9660_vnops.c b/sys/isofs/cd9660/cd9660_vnops.c
index 99baef6c811..7ff1f74a015 100644
--- a/sys/isofs/cd9660/cd9660_vnops.c
+++ b/sys/isofs/cd9660/cd9660_vnops.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: cd9660_vnops.c,v 1.76 2016/06/19 11:54:33 natano Exp $ */
+/* $OpenBSD: cd9660_vnops.c,v 1.77 2019/07/10 16:43:19 anton Exp $ */
/* $NetBSD: cd9660_vnops.c,v 1.42 1997/10/16 23:56:57 christos Exp $ */
/*-
@@ -1016,7 +1016,7 @@ filt_cd9660read(struct knote *kn, long hint)
return (1);
}
- kn->kn_data = node->i_size - kn->kn_fp->f_offset;
+ kn->kn_data = node->i_size - foffset_get(kn->kn_fp);
if (kn->kn_data == 0 && kn->kn_sfflags & NOTE_EOF) {
kn->kn_fflags |= NOTE_EOF;
return (1);
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index e2e4f1c668a..2cf66bb04aa 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: kern_descrip.c,v 1.188 2019/07/03 14:32:02 visa Exp $ */
+/* $OpenBSD: kern_descrip.c,v 1.189 2019/07/10 16:43:19 anton Exp $ */
/* $NetBSD: kern_descrip.c,v 1.42 1996/03/30 22:24:38 christos Exp $ */
/*
@@ -532,12 +532,14 @@ restart:
ktrflock(p, &fl);
#endif
if (fl.l_whence == SEEK_CUR) {
+ off_t offset = foffset_get(fp);
+
if (fl.l_start == 0 && fl.l_len < 0) {
/* lockf(3) compliance hack */
fl.l_len = -fl.l_len;
- fl.l_start = fp->f_offset - fl.l_len;
+ fl.l_start = offset - fl.l_len;
} else
- fl.l_start += fp->f_offset;
+ fl.l_start += offset;
}
switch (fl.l_type) {
@@ -602,12 +604,14 @@ restart:
if (error)
break;
if (fl.l_whence == SEEK_CUR) {
+ off_t offset = foffset_get(fp);
+
if (fl.l_start == 0 && fl.l_len < 0) {
/* lockf(3) compliance hack */
fl.l_len = -fl.l_len;
- fl.l_start = fp->f_offset - fl.l_len;
+ fl.l_start = offset - fl.l_len;
} else
- fl.l_start += fp->f_offset;
+ fl.l_start += offset;
}
if (fl.l_type != F_RDLCK &&
fl.l_type != F_WRLCK &&
@@ -1277,6 +1281,73 @@ fdrop(struct file *fp, struct proc *p)
}
/*
+ * Get the file offset without keeping the same offset locked upon return.
+ */
+off_t
+foffset_get(struct file *fp)
+{
+ off_t offset;
+
+ mtx_enter(&fp->f_mtx);
+ offset = fp->f_offset;
+ mtx_leave(&fp->f_mtx);
+ return (offset);
+}
+
+/*
+ * Acquire an exclusive lock of the file offset. The calling thread must call
+ * foffset_leave() once done.
+ */
+off_t
+foffset_enter(struct file *fp)
+{
+ off_t offset;
+
+ mtx_enter(&fp->f_mtx);
+
+ while (fp->f_olock & FOL_LOCKED) {
+ KASSERT((fp->f_olock & FOL_NWAIT) < FOL_NWAIT);
+ fp->f_olock++;
+ msleep(&fp->f_olock, &fp->f_mtx, PLOCK, "foffset", 0);
+ KASSERT((fp->f_olock & FOL_NWAIT) > 0);
+ fp->f_olock--;
+ }
+ fp->f_olock |= FOL_LOCKED;
+
+ offset = fp->f_offset;
+
+ mtx_leave(&fp->f_mtx);
+
+ return (offset);
+}
+
+/*
+ * Write a new file offset and release the lock. The calling thread must already
+ * have acquired the lock using foffset_enter().
+ * If FO_NOUPDATE is present in flags, only the lock is released and the offset
+ * remains unmodified.
+ */
+void
+foffset_leave(struct file *fp, off_t offset, int flags)
+{
+ unsigned int nwait;
+
+ mtx_enter(&fp->f_mtx);
+
+ KASSERT(fp->f_olock & FOL_LOCKED);
+
+ if ((flags & FO_NOUPDATE) == 0)
+ fp->f_offset = offset;
+ nwait = fp->f_olock & FOL_NWAIT;
+ fp->f_olock &= ~FOL_LOCKED;
+
+ mtx_leave(&fp->f_mtx);
+
+ if (nwait > 0)
+ wakeup_one(&fp->f_olock);
+}
+
+/*
* Apply an advisory lock on a file descriptor.
*
* Just attempt to get a record lock of the requested type on
diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c
index 07e85ba692e..06eb7803191 100644
--- a/sys/kern/kern_sysctl.c
+++ b/sys/kern/kern_sysctl.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: kern_sysctl.c,v 1.360 2019/06/16 00:56:53 bluhm Exp $ */
+/* $OpenBSD: kern_sysctl.c,v 1.361 2019/07/10 16:43:19 anton Exp $ */
/* $NetBSD: kern_sysctl.c,v 1.17 1996/05/20 17:49:05 mrg Exp $ */
/*-
@@ -1100,8 +1100,8 @@ fill_file(struct kinfo_file *kf, struct file *fp, struct filedesc *fdp,
kf->f_usecount = 0;
if (suser(p) == 0 || p->p_ucred->cr_uid == fp->f_cred->cr_uid) {
- kf->f_offset = fp->f_offset;
mtx_enter(&fp->f_mtx);
+ kf->f_offset = fp->f_offset;
kf->f_rxfer = fp->f_rxfer;
kf->f_rwfer = fp->f_wxfer;
kf->f_seek = fp->f_seek;
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 37c2332537f..c334dc98a69 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: vfs_syscalls.c,v 1.319 2019/06/19 16:55:51 deraadt Exp $ */
+/* $OpenBSD: vfs_syscalls.c,v 1.320 2019/07/10 16:43:19 anton Exp $ */
/* $NetBSD: vfs_syscalls.c,v 1.71 1996/04/23 10:29:02 mycroft Exp $ */
/*
@@ -2999,6 +2999,7 @@ sys_getdents(struct proc *p, void *v, register_t *retval)
struct uio auio;
struct iovec aiov;
size_t buflen;
+ off_t offset;
int error, eofflag;
buflen = SCARG(uap, buflen);
@@ -3011,12 +3012,16 @@ sys_getdents(struct proc *p, void *v, register_t *retval)
error = EBADF;
goto bad;
}
- if (fp->f_offset < 0) {
+
+ offset = foffset_enter(fp);
+ if (offset < 0) {
+ foffset_leave(fp, 0, FO_NOUPDATE);
error = EINVAL;
goto bad;
}
vp = fp->f_data;
if (vp->v_type != VDIR) {
+ foffset_leave(fp, 0, FO_NOUPDATE);
error = EINVAL;
goto bad;
}
@@ -3029,10 +3034,10 @@ sys_getdents(struct proc *p, void *v, register_t *retval)
auio.uio_procp = p;
auio.uio_resid = buflen;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- auio.uio_offset = fp->f_offset;
+ auio.uio_offset = offset;
error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag);
- fp->f_offset = auio.uio_offset;
VOP_UNLOCK(vp);
+ foffset_leave(fp, auio.uio_offset, 0);
if (error)
goto bad;
*retval = buflen - auio.uio_resid;
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 520126602c1..43b1dbe1c85 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: vfs_vnops.c,v 1.99 2019/06/22 06:48:25 semarie Exp $ */
+/* $OpenBSD: vfs_vnops.c,v 1.100 2019/07/10 16:43:19 anton Exp $ */
/* $NetBSD: vfs_vnops.c,v 1.20 1996/02/04 02:18:41 christos Exp $ */
/*
@@ -342,38 +342,35 @@ vn_read(struct file *fp, struct uio *uio, int fflags)
size_t count = uio->uio_resid;
off_t offset;
int error;
+ int foflags = 0;
KERNEL_LOCK();
- /*
- * Check below can race. We can block on the vnode lock
- * and resume with a different `fp->f_offset' value.
- */
if ((fflags & FO_POSITION) == 0)
- offset = fp->f_offset;
+ uio->uio_offset = offset = foffset_enter(fp);
else
offset = uio->uio_offset;
/* no wrap around of offsets except on character devices */
if (vp->v_type != VCHR && count > LLONG_MAX - offset) {
+ foflags = FO_NOUPDATE;
error = EINVAL;
goto done;
}
if (vp->v_type == VDIR) {
+ foflags = FO_NOUPDATE;
error = EISDIR;
goto done;
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- if ((fflags & FO_POSITION) == 0)
- uio->uio_offset = fp->f_offset;
error = VOP_READ(vp, uio, (fp->f_flag & FNONBLOCK) ? IO_NDELAY : 0,
cred);
- if ((fflags & FO_POSITION) == 0)
- fp->f_offset += count - uio->uio_resid;
VOP_UNLOCK(vp);
done:
+ if ((fflags & FO_POSITION) == 0)
+ foffset_leave(fp, offset + (count - uio->uio_resid), foflags);
KERNEL_UNLOCK();
return (error);
}
@@ -386,11 +383,15 @@ vn_write(struct file *fp, struct uio *uio, int fflags)
{
struct vnode *vp = fp->f_data;
struct ucred *cred = fp->f_cred;
+ off_t offset;
int error, ioflag = IO_UNIT;
size_t count;
KERNEL_LOCK();
+ if ((fflags & FO_POSITION) == 0)
+ uio->uio_offset = offset = foffset_enter(fp);
+
/* note: pwrite/pwritev are unaffected by O_APPEND */
if (vp->v_type == VREG && (fp->f_flag & O_APPEND) &&
(fflags & FO_POSITION) == 0)
@@ -401,17 +402,15 @@ vn_write(struct file *fp, struct uio *uio, int fflags)
(vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
ioflag |= IO_SYNC;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- if ((fflags & FO_POSITION) == 0)
- uio->uio_offset = fp->f_offset;
count = uio->uio_resid;
error = VOP_WRITE(vp, uio, ioflag, cred);
+ VOP_UNLOCK(vp);
if ((fflags & FO_POSITION) == 0) {
if (ioflag & IO_APPEND)
- fp->f_offset = uio->uio_offset;
+ foffset_leave(fp, uio->uio_offset, 0);
else
- fp->f_offset += count - uio->uio_resid;
+ foffset_leave(fp, offset + (count - uio->uio_resid), 0);
}
- VOP_UNLOCK(vp);
KERNEL_UNLOCK();
return (error);
@@ -509,7 +508,7 @@ vn_ioctl(struct file *fp, u_long com, caddr_t data, struct proc *p)
error = VOP_GETATTR(vp, &vattr, p->p_ucred, p);
if (error)
return (error);
- *(int *)data = vattr.va_size - fp->f_offset;
+ *(int *)data = vattr.va_size - foffset_get(fp);
return (0);
}
if (com == FIONBIO || com == FIOASYNC) /* XXX */
@@ -601,7 +600,7 @@ vn_seek(struct file *fp, off_t *offset, int whence, struct proc *p)
struct ucred *cred = p->p_ucred;
struct vnode *vp = fp->f_data;
struct vattr vattr;
- off_t newoff;
+ off_t curoff, newoff;
int error, special;
if (vp->v_type == VFIFO)
@@ -611,28 +610,35 @@ vn_seek(struct file *fp, off_t *offset, int whence, struct proc *p)
else
special = 0;
+ curoff = foffset_enter(fp);
switch (whence) {
case SEEK_CUR:
- newoff = fp->f_offset + *offset;
+ newoff = curoff + *offset;
break;
case SEEK_END:
error = VOP_GETATTR(vp, &vattr, cred, p);
if (error)
- return (error);
+ goto bad;
newoff = *offset + (off_t)vattr.va_size;
break;
case SEEK_SET:
newoff = *offset;
break;
default:
- return (EINVAL);
+ error = EINVAL;
+ goto bad;
}
- if (!special) {
- if (newoff < 0)
- return(EINVAL);
+ if (!special && newoff < 0) {
+ error = EINVAL;
+ goto bad;
}
- fp->f_offset = *offset = newoff;
+ foffset_leave(fp, newoff, 0);
+ *offset = newoff;
return (0);
+
+bad:
+ foffset_leave(fp, 0, FO_NOUPDATE);
+ return (error);
}
/*
diff --git a/sys/miscfs/fuse/fuse_vnops.c b/sys/miscfs/fuse/fuse_vnops.c
index c28e9efe1c7..c4351c3155b 100644
--- a/sys/miscfs/fuse/fuse_vnops.c
+++ b/sys/miscfs/fuse/fuse_vnops.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: fuse_vnops.c,v 1.52 2018/07/18 10:47:02 helg Exp $ */
+/* $OpenBSD: fuse_vnops.c,v 1.53 2019/07/10 16:43:19 anton Exp $ */
/*
* Copyright (c) 2012-2013 Sylvestre Gallon <ccna.syl@gmail.com>
*
@@ -168,7 +168,7 @@ filt_fusefsread(struct knote *kn, long hint)
return (1);
}
- kn->kn_data = ip->filesize - kn->kn_fp->f_offset;
+ kn->kn_data = ip->filesize - foffset_get(kn->kn_fp);
if (kn->kn_data == 0 && kn->kn_sfflags & NOTE_EOF) {
kn->kn_fflags |= NOTE_EOF;
return (1);
diff --git a/sys/msdosfs/msdosfs_vnops.c b/sys/msdosfs/msdosfs_vnops.c
index a2e465cbb3a..d39361ce732 100644
--- a/sys/msdosfs/msdosfs_vnops.c
+++ b/sys/msdosfs/msdosfs_vnops.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: msdosfs_vnops.c,v 1.122 2018/06/21 14:17:23 visa Exp $ */
+/* $OpenBSD: msdosfs_vnops.c,v 1.123 2019/07/10 16:43:19 anton Exp $ */
/* $NetBSD: msdosfs_vnops.c,v 1.63 1997/10/17 11:24:19 ws Exp $ */
/*-
@@ -2017,7 +2017,7 @@ filt_msdosfsread(struct knote *kn, long hint)
return (1);
}
- kn->kn_data = dep->de_FileSize - kn->kn_fp->f_offset;
+ kn->kn_data = dep->de_FileSize - foffset_get(kn->kn_fp);
if (kn->kn_data == 0 && kn->kn_sfflags & NOTE_EOF) {
kn->kn_fflags |= NOTE_EOF;
return (1);
diff --git a/sys/nfs/nfs_kq.c b/sys/nfs/nfs_kq.c
index b6b227c52a7..459e40907a7 100644
--- a/sys/nfs/nfs_kq.c
+++ b/sys/nfs/nfs_kq.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: nfs_kq.c,v 1.22 2014/11/15 00:03:12 tedu Exp $ */
+/* $OpenBSD: nfs_kq.c,v 1.23 2019/07/10 16:43:20 anton Exp $ */
/* $NetBSD: nfs_kq.c,v 1.7 2003/10/30 01:43:10 simonb Exp $ */
/*-
@@ -226,7 +226,7 @@ filt_nfsread(struct knote *kn, long hint)
return (1);
}
- kn->kn_data = np->n_size - kn->kn_fp->f_offset;
+ kn->kn_data = np->n_size - foffset_get(kn->kn_fp);
#ifdef DEBUG
printf("nfsread event. %lld\n", kn->kn_data);
#endif
diff --git a/sys/sys/file.h b/sys/sys/file.h
index 0054c6df76b..7c5e3aa0468 100644
--- a/sys/sys/file.h
+++ b/sys/sys/file.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: file.h,v 1.54 2019/06/22 06:48:25 semarie Exp $ */
+/* $OpenBSD: file.h,v 1.55 2019/07/10 16:43:20 anton Exp $ */
/* $NetBSD: file.h,v 1.11 1995/03/26 20:24:13 jtc Exp $ */
/*
@@ -64,7 +64,7 @@ struct fileops {
int (*fo_seek)(struct file *, off_t *, int, struct proc *);
};
#define FO_POSITION 0x01 /* positioned read/write */
-
+#define FO_NOUPDATE 0x00000002 /* don't update file offset */
/*
* Kernel descriptor table.
@@ -90,8 +90,9 @@ struct file {
u_int f_count; /* [a] reference count */
struct ucred *f_cred; /* [I] credentials associated with descriptor */
struct fileops *f_ops; /* [I] file operation pointers */
- off_t f_offset; /* [k] */
+ off_t f_offset; /* [f] */
void *f_data; /* [I] private data */
+ u_int f_olock; /* [f] offset lock */
int f_iflags; /* [k] internal flags */
uint64_t f_rxfer; /* [f] total number of read transfers */
uint64_t f_wxfer; /* [f] total number of write transfers */
@@ -117,6 +118,13 @@ struct file {
int fdrop(struct file *, struct proc *);
+off_t foffset_get(struct file *);
+off_t foffset_enter(struct file *);
+void foffset_leave(struct file *, off_t, int);
+
+#define FOL_NWAIT 0x7fffffffu /* number of waiters */
+#define FOL_LOCKED 0x80000000u /* file offset is locked */
+
LIST_HEAD(filelist, file);
extern int maxfiles; /* kernel limit on number of open files */
extern int numfiles; /* actual number of open files */
diff --git a/sys/tmpfs/tmpfs_vnops.c b/sys/tmpfs/tmpfs_vnops.c
index 8b5c7a76b46..69a814ae26a 100644
--- a/sys/tmpfs/tmpfs_vnops.c
+++ b/sys/tmpfs/tmpfs_vnops.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: tmpfs_vnops.c,v 1.33 2018/10/22 17:31:25 krw Exp $ */
+/* $OpenBSD: tmpfs_vnops.c,v 1.34 2019/07/10 16:43:20 anton Exp $ */
/* $NetBSD: tmpfs_vnops.c,v 1.100 2012/11/05 17:27:39 dholland Exp $ */
/*
@@ -2645,7 +2645,7 @@ filt_tmpfsread(struct knote *kn, long hint)
return (1);
}
- kn->kn_data = node->tn_size - kn->kn_fp->f_offset;
+ kn->kn_data = node->tn_size - foffset_get(kn->kn_fp);
if (kn->kn_data == 0 && kn->kn_sfflags & NOTE_EOF) {
kn->kn_fflags |= NOTE_EOF;
return (1);
diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c
index 82ebd356af0..224a37250a2 100644
--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ufs_vnops.c,v 1.143 2018/12/23 10:46:51 natano Exp $ */
+/* $OpenBSD: ufs_vnops.c,v 1.144 2019/07/10 16:43:20 anton Exp $ */
/* $NetBSD: ufs_vnops.c,v 1.18 1996/05/11 18:28:04 mycroft Exp $ */
/*
@@ -1952,10 +1952,10 @@ filt_ufsread(struct knote *kn, long hint)
#ifdef EXT2FS
if (IS_EXT2_VNODE(ip->i_vnode))
- kn->kn_data = ext2fs_size(ip) - kn->kn_fp->f_offset;
+ kn->kn_data = ext2fs_size(ip) - foffset_get(kn->kn_fp);
else
#endif
- kn->kn_data = DIP(ip, size) - kn->kn_fp->f_offset;
+ kn->kn_data = DIP(ip, size) - foffset_get(kn->kn_fp);
if (kn->kn_data == 0 && kn->kn_sfflags & NOTE_EOF) {
kn->kn_fflags |= NOTE_EOF;
return (1);