diff options
author | anton <anton@cvs.openbsd.org> | 2019-07-10 16:43:21 +0000 |
---|---|---|
committer | anton <anton@cvs.openbsd.org> | 2019-07-10 16:43:21 +0000 |
commit | d8bf193ec09f26b570daa057ab9961744de1794d (patch) | |
tree | fb42455d3d5d0bb3236d5a9551cade60758d5ad7 | |
parent | cfb2b7e76d2d97505ffb3e2ac6a43561e980f07a (diff) |
Make read/write of the f_offset field belonging to struct file MP-safe,
as part of the effort to unlock the kernel. Instead of relying on the
vnode lock, introduce a dedicated lock per file. Exclusive write access
is granted using the new foffset_enter and foffset_leave API. A
convenience function foffset_get is also available for threads that only
need to read the current offset.
The lock acquisition order in vn_write() has been changed to match the one
in vn_read() (file offset lock first, then the vnode lock) in order to
avoid a potential deadlock. This change also gets rid of a documented
race in vn_read().
Inspired by the FreeBSD implementation.
With help and ok mpi@ visa@
-rw-r--r-- | sys/dev/pci/drm/drm_linux.c | 6 | ||||
-rw-r--r-- | sys/isofs/cd9660/cd9660_vnops.c | 4 | ||||
-rw-r--r-- | sys/kern/kern_descrip.c | 81 | ||||
-rw-r--r-- | sys/kern/kern_sysctl.c | 4 | ||||
-rw-r--r-- | sys/kern/vfs_syscalls.c | 13 | ||||
-rw-r--r-- | sys/kern/vfs_vnops.c | 54 | ||||
-rw-r--r-- | sys/miscfs/fuse/fuse_vnops.c | 4 | ||||
-rw-r--r-- | sys/msdosfs/msdosfs_vnops.c | 4 | ||||
-rw-r--r-- | sys/nfs/nfs_kq.c | 4 | ||||
-rw-r--r-- | sys/sys/file.h | 14 | ||||
-rw-r--r-- | sys/tmpfs/tmpfs_vnops.c | 4 | ||||
-rw-r--r-- | sys/ufs/ufs/ufs_vnops.c | 6 |
12 files changed, 145 insertions, 53 deletions
diff --git a/sys/dev/pci/drm/drm_linux.c b/sys/dev/pci/drm/drm_linux.c index e338d37390c..90dc896ad3e 100644 --- a/sys/dev/pci/drm/drm_linux.c +++ b/sys/dev/pci/drm/drm_linux.c @@ -1,4 +1,4 @@ -/* $OpenBSD: drm_linux.c,v 1.42 2019/07/10 07:56:30 kettenis Exp $ */ +/* $OpenBSD: drm_linux.c,v 1.43 2019/07/10 16:43:19 anton Exp $ */ /* * Copyright (c) 2013 Jonathan Gray <jsg@openbsd.org> * Copyright (c) 2015, 2016 Mark Kettenis <kettenis@openbsd.org> @@ -1346,7 +1346,9 @@ dmabuf_seek(struct file *fp, off_t *offset, int whence, struct proc *p) default: return (EINVAL); } - fp->f_offset = *offset = newoff; + foffset_enter(fp); + foffset_leave(fp, newoff, 0); + *offset = newoff; return (0); } diff --git a/sys/isofs/cd9660/cd9660_vnops.c b/sys/isofs/cd9660/cd9660_vnops.c index 99baef6c811..7ff1f74a015 100644 --- a/sys/isofs/cd9660/cd9660_vnops.c +++ b/sys/isofs/cd9660/cd9660_vnops.c @@ -1,4 +1,4 @@ -/* $OpenBSD: cd9660_vnops.c,v 1.76 2016/06/19 11:54:33 natano Exp $ */ +/* $OpenBSD: cd9660_vnops.c,v 1.77 2019/07/10 16:43:19 anton Exp $ */ /* $NetBSD: cd9660_vnops.c,v 1.42 1997/10/16 23:56:57 christos Exp $ */ /*- @@ -1016,7 +1016,7 @@ filt_cd9660read(struct knote *kn, long hint) return (1); } - kn->kn_data = node->i_size - kn->kn_fp->f_offset; + kn->kn_data = node->i_size - foffset_get(kn->kn_fp); if (kn->kn_data == 0 && kn->kn_sfflags & NOTE_EOF) { kn->kn_fflags |= NOTE_EOF; return (1); diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index e2e4f1c668a..2cf66bb04aa 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -1,4 +1,4 @@ -/* $OpenBSD: kern_descrip.c,v 1.188 2019/07/03 14:32:02 visa Exp $ */ +/* $OpenBSD: kern_descrip.c,v 1.189 2019/07/10 16:43:19 anton Exp $ */ /* $NetBSD: kern_descrip.c,v 1.42 1996/03/30 22:24:38 christos Exp $ */ /* @@ -532,12 +532,14 @@ restart: ktrflock(p, &fl); #endif if (fl.l_whence == SEEK_CUR) { + off_t offset = foffset_get(fp); + if (fl.l_start == 0 && fl.l_len < 0) { /* lockf(3) compliance hack */ 
fl.l_len = -fl.l_len; - fl.l_start = fp->f_offset - fl.l_len; + fl.l_start = offset - fl.l_len; } else - fl.l_start += fp->f_offset; + fl.l_start += offset; } switch (fl.l_type) { @@ -602,12 +604,14 @@ restart: if (error) break; if (fl.l_whence == SEEK_CUR) { + off_t offset = foffset_get(fp); + if (fl.l_start == 0 && fl.l_len < 0) { /* lockf(3) compliance hack */ fl.l_len = -fl.l_len; - fl.l_start = fp->f_offset - fl.l_len; + fl.l_start = offset - fl.l_len; } else - fl.l_start += fp->f_offset; + fl.l_start += offset; } if (fl.l_type != F_RDLCK && fl.l_type != F_WRLCK && @@ -1277,6 +1281,73 @@ fdrop(struct file *fp, struct proc *p) } /* + * Get the file offset without keeping the same offset locked upon return. + */ +off_t +foffset_get(struct file *fp) +{ + off_t offset; + + mtx_enter(&fp->f_mtx); + offset = fp->f_offset; + mtx_leave(&fp->f_mtx); + return (offset); +} + +/* + * Acquire an exclusive lock of the file offset. The calling thread must call + * foffset_leave() once done. + */ +off_t +foffset_enter(struct file *fp) +{ + off_t offset; + + mtx_enter(&fp->f_mtx); + + while (fp->f_olock & FOL_LOCKED) { + KASSERT((fp->f_olock & FOL_NWAIT) < FOL_NWAIT); + fp->f_olock++; + msleep(&fp->f_olock, &fp->f_mtx, PLOCK, "foffset", 0); + KASSERT((fp->f_olock & FOL_NWAIT) > 0); + fp->f_olock--; + } + fp->f_olock |= FOL_LOCKED; + + offset = fp->f_offset; + + mtx_leave(&fp->f_mtx); + + return (offset); +} + +/* + * Write a new file offset and release the lock. The calling thread must already + * have acquired the lock using foffset_enter(). + * If FO_NOUPDATE is present in flags, only the lock is released and the offset + * remains unmodified. 
+ */ +void +foffset_leave(struct file *fp, off_t offset, int flags) +{ + unsigned int nwait; + + mtx_enter(&fp->f_mtx); + + KASSERT(fp->f_olock & FOL_LOCKED); + + if ((flags & FO_NOUPDATE) == 0) + fp->f_offset = offset; + nwait = fp->f_olock & FOL_NWAIT; + fp->f_olock &= ~FOL_LOCKED; + + mtx_leave(&fp->f_mtx); + + if (nwait > 0) + wakeup_one(&fp->f_olock); +} + +/* * Apply an advisory lock on a file descriptor. * * Just attempt to get a record lock of the requested type on diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c index 07e85ba692e..06eb7803191 100644 --- a/sys/kern/kern_sysctl.c +++ b/sys/kern/kern_sysctl.c @@ -1,4 +1,4 @@ -/* $OpenBSD: kern_sysctl.c,v 1.360 2019/06/16 00:56:53 bluhm Exp $ */ +/* $OpenBSD: kern_sysctl.c,v 1.361 2019/07/10 16:43:19 anton Exp $ */ /* $NetBSD: kern_sysctl.c,v 1.17 1996/05/20 17:49:05 mrg Exp $ */ /*- @@ -1100,8 +1100,8 @@ fill_file(struct kinfo_file *kf, struct file *fp, struct filedesc *fdp, kf->f_usecount = 0; if (suser(p) == 0 || p->p_ucred->cr_uid == fp->f_cred->cr_uid) { - kf->f_offset = fp->f_offset; mtx_enter(&fp->f_mtx); + kf->f_offset = fp->f_offset; kf->f_rxfer = fp->f_rxfer; kf->f_rwfer = fp->f_wxfer; kf->f_seek = fp->f_seek; diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 37c2332537f..c334dc98a69 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vfs_syscalls.c,v 1.319 2019/06/19 16:55:51 deraadt Exp $ */ +/* $OpenBSD: vfs_syscalls.c,v 1.320 2019/07/10 16:43:19 anton Exp $ */ /* $NetBSD: vfs_syscalls.c,v 1.71 1996/04/23 10:29:02 mycroft Exp $ */ /* @@ -2999,6 +2999,7 @@ sys_getdents(struct proc *p, void *v, register_t *retval) struct uio auio; struct iovec aiov; size_t buflen; + off_t offset; int error, eofflag; buflen = SCARG(uap, buflen); @@ -3011,12 +3012,16 @@ sys_getdents(struct proc *p, void *v, register_t *retval) error = EBADF; goto bad; } - if (fp->f_offset < 0) { + + offset = foffset_enter(fp); + if (offset < 0) { + 
foffset_leave(fp, 0, FO_NOUPDATE); error = EINVAL; goto bad; } vp = fp->f_data; if (vp->v_type != VDIR) { + foffset_leave(fp, 0, FO_NOUPDATE); error = EINVAL; goto bad; } @@ -3029,10 +3034,10 @@ sys_getdents(struct proc *p, void *v, register_t *retval) auio.uio_procp = p; auio.uio_resid = buflen; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - auio.uio_offset = fp->f_offset; + auio.uio_offset = offset; error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag); - fp->f_offset = auio.uio_offset; VOP_UNLOCK(vp); + foffset_leave(fp, auio.uio_offset, 0); if (error) goto bad; *retval = buflen - auio.uio_resid; diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 520126602c1..43b1dbe1c85 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vfs_vnops.c,v 1.99 2019/06/22 06:48:25 semarie Exp $ */ +/* $OpenBSD: vfs_vnops.c,v 1.100 2019/07/10 16:43:19 anton Exp $ */ /* $NetBSD: vfs_vnops.c,v 1.20 1996/02/04 02:18:41 christos Exp $ */ /* @@ -342,38 +342,35 @@ vn_read(struct file *fp, struct uio *uio, int fflags) size_t count = uio->uio_resid; off_t offset; int error; + int foflags = 0; KERNEL_LOCK(); - /* - * Check below can race. We can block on the vnode lock - * and resume with a different `fp->f_offset' value. - */ if ((fflags & FO_POSITION) == 0) - offset = fp->f_offset; + uio->uio_offset = offset = foffset_enter(fp); else offset = uio->uio_offset; /* no wrap around of offsets except on character devices */ if (vp->v_type != VCHR && count > LLONG_MAX - offset) { + foflags = FO_NOUPDATE; error = EINVAL; goto done; } if (vp->v_type == VDIR) { + foflags = FO_NOUPDATE; error = EISDIR; goto done; } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - if ((fflags & FO_POSITION) == 0) - uio->uio_offset = fp->f_offset; error = VOP_READ(vp, uio, (fp->f_flag & FNONBLOCK) ? 
IO_NDELAY : 0, cred); - if ((fflags & FO_POSITION) == 0) - fp->f_offset += count - uio->uio_resid; VOP_UNLOCK(vp); done: + if ((fflags & FO_POSITION) == 0) + foffset_leave(fp, offset + (count - uio->uio_resid), foflags); KERNEL_UNLOCK(); return (error); } @@ -386,11 +383,15 @@ vn_write(struct file *fp, struct uio *uio, int fflags) { struct vnode *vp = fp->f_data; struct ucred *cred = fp->f_cred; + off_t offset; int error, ioflag = IO_UNIT; size_t count; KERNEL_LOCK(); + if ((fflags & FO_POSITION) == 0) + uio->uio_offset = offset = foffset_enter(fp); + /* note: pwrite/pwritev are unaffected by O_APPEND */ if (vp->v_type == VREG && (fp->f_flag & O_APPEND) && (fflags & FO_POSITION) == 0) @@ -401,17 +402,15 @@ vn_write(struct file *fp, struct uio *uio, int fflags) (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) ioflag |= IO_SYNC; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - if ((fflags & FO_POSITION) == 0) - uio->uio_offset = fp->f_offset; count = uio->uio_resid; error = VOP_WRITE(vp, uio, ioflag, cred); + VOP_UNLOCK(vp); if ((fflags & FO_POSITION) == 0) { if (ioflag & IO_APPEND) - fp->f_offset = uio->uio_offset; + foffset_leave(fp, uio->uio_offset, 0); else - fp->f_offset += count - uio->uio_resid; + foffset_leave(fp, offset + (count - uio->uio_resid), 0); } - VOP_UNLOCK(vp); KERNEL_UNLOCK(); return (error); @@ -509,7 +508,7 @@ vn_ioctl(struct file *fp, u_long com, caddr_t data, struct proc *p) error = VOP_GETATTR(vp, &vattr, p->p_ucred, p); if (error) return (error); - *(int *)data = vattr.va_size - fp->f_offset; + *(int *)data = vattr.va_size - foffset_get(fp); return (0); } if (com == FIONBIO || com == FIOASYNC) /* XXX */ @@ -601,7 +600,7 @@ vn_seek(struct file *fp, off_t *offset, int whence, struct proc *p) struct ucred *cred = p->p_ucred; struct vnode *vp = fp->f_data; struct vattr vattr; - off_t newoff; + off_t curoff, newoff; int error, special; if (vp->v_type == VFIFO) @@ -611,28 +610,35 @@ vn_seek(struct file *fp, off_t *offset, int whence, struct proc 
*p) else special = 0; + curoff = foffset_enter(fp); switch (whence) { case SEEK_CUR: - newoff = fp->f_offset + *offset; + newoff = curoff + *offset; break; case SEEK_END: error = VOP_GETATTR(vp, &vattr, cred, p); if (error) - return (error); + goto bad; newoff = *offset + (off_t)vattr.va_size; break; case SEEK_SET: newoff = *offset; break; default: - return (EINVAL); + error = EINVAL; + goto bad; } - if (!special) { - if (newoff < 0) - return(EINVAL); + if (!special && newoff < 0) { + error = EINVAL; + goto bad; } - fp->f_offset = *offset = newoff; + foffset_leave(fp, newoff, 0); + *offset = newoff; return (0); + +bad: + foffset_leave(fp, 0, FO_NOUPDATE); + return (error); } /* diff --git a/sys/miscfs/fuse/fuse_vnops.c b/sys/miscfs/fuse/fuse_vnops.c index c28e9efe1c7..c4351c3155b 100644 --- a/sys/miscfs/fuse/fuse_vnops.c +++ b/sys/miscfs/fuse/fuse_vnops.c @@ -1,4 +1,4 @@ -/* $OpenBSD: fuse_vnops.c,v 1.52 2018/07/18 10:47:02 helg Exp $ */ +/* $OpenBSD: fuse_vnops.c,v 1.53 2019/07/10 16:43:19 anton Exp $ */ /* * Copyright (c) 2012-2013 Sylvestre Gallon <ccna.syl@gmail.com> * @@ -168,7 +168,7 @@ filt_fusefsread(struct knote *kn, long hint) return (1); } - kn->kn_data = ip->filesize - kn->kn_fp->f_offset; + kn->kn_data = ip->filesize - foffset_get(kn->kn_fp); if (kn->kn_data == 0 && kn->kn_sfflags & NOTE_EOF) { kn->kn_fflags |= NOTE_EOF; return (1); diff --git a/sys/msdosfs/msdosfs_vnops.c b/sys/msdosfs/msdosfs_vnops.c index a2e465cbb3a..d39361ce732 100644 --- a/sys/msdosfs/msdosfs_vnops.c +++ b/sys/msdosfs/msdosfs_vnops.c @@ -1,4 +1,4 @@ -/* $OpenBSD: msdosfs_vnops.c,v 1.122 2018/06/21 14:17:23 visa Exp $ */ +/* $OpenBSD: msdosfs_vnops.c,v 1.123 2019/07/10 16:43:19 anton Exp $ */ /* $NetBSD: msdosfs_vnops.c,v 1.63 1997/10/17 11:24:19 ws Exp $ */ /*- @@ -2017,7 +2017,7 @@ filt_msdosfsread(struct knote *kn, long hint) return (1); } - kn->kn_data = dep->de_FileSize - kn->kn_fp->f_offset; + kn->kn_data = dep->de_FileSize - foffset_get(kn->kn_fp); if (kn->kn_data == 0 && 
kn->kn_sfflags & NOTE_EOF) { kn->kn_fflags |= NOTE_EOF; return (1); diff --git a/sys/nfs/nfs_kq.c b/sys/nfs/nfs_kq.c index b6b227c52a7..459e40907a7 100644 --- a/sys/nfs/nfs_kq.c +++ b/sys/nfs/nfs_kq.c @@ -1,4 +1,4 @@ -/* $OpenBSD: nfs_kq.c,v 1.22 2014/11/15 00:03:12 tedu Exp $ */ +/* $OpenBSD: nfs_kq.c,v 1.23 2019/07/10 16:43:20 anton Exp $ */ /* $NetBSD: nfs_kq.c,v 1.7 2003/10/30 01:43:10 simonb Exp $ */ /*- @@ -226,7 +226,7 @@ filt_nfsread(struct knote *kn, long hint) return (1); } - kn->kn_data = np->n_size - kn->kn_fp->f_offset; + kn->kn_data = np->n_size - foffset_get(kn->kn_fp); #ifdef DEBUG printf("nfsread event. %lld\n", kn->kn_data); #endif diff --git a/sys/sys/file.h b/sys/sys/file.h index 0054c6df76b..7c5e3aa0468 100644 --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -1,4 +1,4 @@ -/* $OpenBSD: file.h,v 1.54 2019/06/22 06:48:25 semarie Exp $ */ +/* $OpenBSD: file.h,v 1.55 2019/07/10 16:43:20 anton Exp $ */ /* $NetBSD: file.h,v 1.11 1995/03/26 20:24:13 jtc Exp $ */ /* @@ -64,7 +64,7 @@ struct fileops { int (*fo_seek)(struct file *, off_t *, int, struct proc *); }; #define FO_POSITION 0x01 /* positioned read/write */ - +#define FO_NOUPDATE 0x00000002 /* don't update file offset */ /* * Kernel descriptor table. 
@@ -90,8 +90,9 @@ struct file { u_int f_count; /* [a] reference count */ struct ucred *f_cred; /* [I] credentials associated with descriptor */ struct fileops *f_ops; /* [I] file operation pointers */ - off_t f_offset; /* [k] */ + off_t f_offset; /* [f] */ void *f_data; /* [I] private data */ + u_int f_olock; /* [f] offset lock */ int f_iflags; /* [k] internal flags */ uint64_t f_rxfer; /* [f] total number of read transfers */ uint64_t f_wxfer; /* [f] total number of write transfers */ @@ -117,6 +118,13 @@ struct file { int fdrop(struct file *, struct proc *); +off_t foffset_get(struct file *); +off_t foffset_enter(struct file *); +void foffset_leave(struct file *, off_t, int); + +#define FOL_NWAIT 0x7fffffffu /* number of waiters */ +#define FOL_LOCKED 0x80000000u /* file offset is locked */ + LIST_HEAD(filelist, file); extern int maxfiles; /* kernel limit on number of open files */ extern int numfiles; /* actual number of open files */ diff --git a/sys/tmpfs/tmpfs_vnops.c b/sys/tmpfs/tmpfs_vnops.c index 8b5c7a76b46..69a814ae26a 100644 --- a/sys/tmpfs/tmpfs_vnops.c +++ b/sys/tmpfs/tmpfs_vnops.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tmpfs_vnops.c,v 1.33 2018/10/22 17:31:25 krw Exp $ */ +/* $OpenBSD: tmpfs_vnops.c,v 1.34 2019/07/10 16:43:20 anton Exp $ */ /* $NetBSD: tmpfs_vnops.c,v 1.100 2012/11/05 17:27:39 dholland Exp $ */ /* @@ -2645,7 +2645,7 @@ filt_tmpfsread(struct knote *kn, long hint) return (1); } - kn->kn_data = node->tn_size - kn->kn_fp->f_offset; + kn->kn_data = node->tn_size - foffset_get(kn->kn_fp); if (kn->kn_data == 0 && kn->kn_sfflags & NOTE_EOF) { kn->kn_fflags |= NOTE_EOF; return (1); diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index 82ebd356af0..224a37250a2 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ufs_vnops.c,v 1.143 2018/12/23 10:46:51 natano Exp $ */ +/* $OpenBSD: ufs_vnops.c,v 1.144 2019/07/10 16:43:20 anton Exp $ */ /* $NetBSD: ufs_vnops.c,v 1.18 1996/05/11 18:28:04 mycroft 
Exp $ */ /* @@ -1952,10 +1952,10 @@ filt_ufsread(struct knote *kn, long hint) #ifdef EXT2FS if (IS_EXT2_VNODE(ip->i_vnode)) - kn->kn_data = ext2fs_size(ip) - kn->kn_fp->f_offset; + kn->kn_data = ext2fs_size(ip) - foffset_get(kn->kn_fp); else #endif - kn->kn_data = DIP(ip, size) - kn->kn_fp->f_offset; + kn->kn_data = DIP(ip, size) - foffset_get(kn->kn_fp); if (kn->kn_data == 0 && kn->kn_sfflags & NOTE_EOF) { kn->kn_fflags |= NOTE_EOF; return (1); |