diff options
author | anton <anton@cvs.openbsd.org> | 2019-01-21 18:09:22 +0000 |
---|---|---|
committer | anton <anton@cvs.openbsd.org> | 2019-01-21 18:09:22 +0000 |
commit | bc327c97b94e27819444bf325d33c2b5e5708a4f (patch) | |
tree | 97cc661b5437aac51b4bad9155bb1d3ce02530f5 | |
parent | b1b40e2217c9f88f38bcc644233882a46076587f (diff) |
Introduce a dedicated entry point data structure for file locks. This new data
structure allows for better tracking of pending lock operations, which is
essential to prevent a use-after-free once the underlying vnode is
gone.
Inspired by the lockf implementation in FreeBSD.
ok visa@
Reported-by: syzbot+d5540a236382f50f1dac@syzkaller.appspotmail.com
-rw-r--r-- | sys/kern/vfs_lockf.c | 162 | ||||
-rw-r--r-- | sys/kern/vfs_subr.c | 4 | ||||
-rw-r--r-- | sys/msdosfs/denode.h | 4 | ||||
-rw-r--r-- | sys/nfs/nfsnode.h | 4 | ||||
-rw-r--r-- | sys/sys/fcntl.h | 3 | ||||
-rw-r--r-- | sys/sys/lockf.h | 16 | ||||
-rw-r--r-- | sys/sys/specdev.h | 4 | ||||
-rw-r--r-- | sys/tmpfs/tmpfs.h | 4 | ||||
-rw-r--r-- | sys/ufs/ufs/inode.h | 4 |
9 files changed, 153 insertions, 52 deletions
diff --git a/sys/kern/vfs_lockf.c b/sys/kern/vfs_lockf.c index 5ee98ddd38c..7fa6d9248c7 100644 --- a/sys/kern/vfs_lockf.c +++ b/sys/kern/vfs_lockf.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vfs_lockf.c,v 1.31 2018/11/10 21:21:15 anton Exp $ */ +/* $OpenBSD: vfs_lockf.c,v 1.32 2019/01/21 18:09:21 anton Exp $ */ /* $NetBSD: vfs_lockf.c,v 1.7 1996/02/04 02:18:21 christos Exp $ */ /* @@ -45,7 +45,8 @@ #include <sys/lockf.h> #include <sys/unistd.h> -struct pool lockfpool; +struct pool lockf_state_pool; +struct pool lockf_pool; /* * This variable controls the maximum number of processes that will @@ -75,16 +76,42 @@ int lockf_debug = DEBUG_SETLOCK|DEBUG_CLEARLOCK|DEBUG_WAKELOCK; #define LFPRINT(args, level) #endif +void ls_ref(struct lockf_state *); +void ls_rele(struct lockf_state *); + void lf_init(void) { - pool_init(&lockfpool, sizeof(struct lockf), 0, IPL_NONE, PR_WAITOK, - "lockfpl", NULL); + pool_init(&lockf_state_pool, sizeof(struct lockf_state), 0, IPL_NONE, + PR_WAITOK, "lockfspl", NULL); + pool_init(&lockf_pool, sizeof(struct lockf), 0, IPL_NONE, + PR_WAITOK, "lockfpl", NULL); } struct lockf *lf_alloc(uid_t, int); void lf_free(struct lockf *); +void +ls_ref(struct lockf_state *ls) +{ + ls->ls_refs++; +} + +void +ls_rele(struct lockf_state *ls) +{ + if (--ls->ls_refs > 0) + return; + +#ifdef LOCKF_DIAGNOSTIC + KASSERT(ls->ls_head == NULL); + KASSERT(ls->ls_pending == 0); +#endif + + *ls->ls_owner = NULL; + pool_put(&lockf_state_pool, ls); +} + /* * We enforce a limit on locks by uid, so that a single user cannot * run the kernel out of memory. For now, the limit is pretty coarse. 
@@ -115,7 +142,7 @@ lf_alloc(uid_t uid, int allowfail) } uip->ui_lockcnt++; uid_release(uip); - lock = pool_get(&lockfpool, PR_WAITOK); + lock = pool_get(&lockf_pool, PR_WAITOK); lock->lf_uid = uid; return (lock); } @@ -127,14 +154,14 @@ lf_free(struct lockf *lock) LFPRINT(("lf_free", lock), DEBUG_LINK); - if (*lock->lf_head == lock) { + if (lock->lf_state->ls_head == lock) { LFPRINT(("lf_free: head", lock->lf_next), DEBUG_LINK); #ifdef LOCKF_DIAGNOSTIC KASSERT(lock->lf_prev == NULL); #endif /* LOCKF_DIAGNOSTIC */ - *lock->lf_head = lock->lf_next; + lock->lf_state->ls_head = lock->lf_next; } #ifdef LOCKF_DIAGNOSTIC @@ -142,11 +169,12 @@ lf_free(struct lockf *lock) #endif /* LOCKF_DIAGNOSTIC */ lf_unlink(lock); + ls_rele(lock->lf_state); uip = uid_find(lock->lf_uid); uip->ui_lockcnt--; uid_release(uip); - pool_put(&lockfpool, lock); + pool_put(&lockf_pool, lock); } void @@ -176,18 +204,14 @@ lf_link(struct lockf *lock1, struct lockf *lock2) } lock2->lf_prev = lock1; - if (*lock1->lf_head == NULL) { + if (lock1->lf_state->ls_head == NULL) { LFPRINT(("lf_link: head", lock1), DEBUG_LINK); -#ifdef LOCKF_DIAGNOSTIC - KASSERT(*lock2->lf_head == NULL); -#endif /* LOCKF_DIAGNOSTIC */ - - *lock1->lf_head = lock1; - } else if (*lock2->lf_head == lock2) { + lock1->lf_state->ls_head = lock1; + } else if (lock1->lf_state->ls_head == lock2) { LFPRINT(("lf_link: swap head", lock1), DEBUG_LINK); - *lock1->lf_head = lock1; + lock1->lf_state->ls_head = lock1; } } @@ -207,10 +231,11 @@ lf_unlink(struct lockf *lock) * Do an advisory lock operation. 
*/ int -lf_advlock(struct lockf **head, off_t size, caddr_t id, int op, +lf_advlock(struct lockf_state **state, off_t size, caddr_t id, int op, struct flock *fl, int flags) { struct proc *p = curproc; + struct lockf_state *ls; struct lockf *lock; off_t start, end; int error; @@ -248,23 +273,39 @@ lf_advlock(struct lockf **head, off_t size, caddr_t id, int op, end = -1; } + if (*state == NULL) { + ls = pool_get(&lockf_state_pool, PR_WAITOK | PR_ZERO); + /* pool_get() can sleep, ensure the race was won. */ + if (*state == NULL) { + *state = ls; + ls->ls_owner = state; + } else { + pool_put(&lockf_state_pool, ls); + } + } + ls = *state; + ls_ref(ls); + /* * Avoid the common case of unlocking when inode has no locks. */ - if (*head == NULL) { + if (ls->ls_head == NULL) { if (op != F_SETLK) { + ls_rele(ls); fl->l_type = F_UNLCK; return (0); } } lock = lf_alloc(p->p_ucred->cr_uid, op == F_SETLK ? 1 : 2); - if (!lock) + if (!lock) { + ls_rele(ls); return (ENOLCK); + } lock->lf_start = start; lock->lf_end = end; lock->lf_id = id; - lock->lf_head = head; + lock->lf_state = ls; lock->lf_type = fl->l_type; lock->lf_prev = NULL; lock->lf_next = NULL; @@ -297,7 +338,6 @@ int lf_setlock(struct lockf *lock) { struct lockf *block; - struct lockf **head = lock->lf_head; struct lockf *prev, *overlap, *ltmp; static char lockstr[] = "lockf"; int ovcase, priority, needtolink, error; @@ -369,7 +409,10 @@ lf_setlock(struct lockf *lock) LFPRINT(("lf_setlock", lock), DEBUG_SETLOCK); LFPRINT(("lf_setlock: blocking on", block), DEBUG_SETLOCK); TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block); + lock->lf_state->ls_pending++; error = tsleep(lock, priority, lockstr, 0); + lock->lf_state->ls_pending--; + wakeup_one(lock->lf_state); if (lock->lf_next != NULL) { TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block); lock->lf_next = NULL; @@ -378,6 +421,10 @@ lf_setlock(struct lockf *lock) lf_free(lock); return (error); } + if (lock->lf_flags & F_INTR) { + lf_free(lock); + return (EINTR); + } } 
/* * No blocks!! Add the lock. Note that we will @@ -387,7 +434,7 @@ lf_setlock(struct lockf *lock) * Skip over locks owned by other processes. * Handle any locks that overlap and are owned by ourselves. */ - block = *head; + block = lock->lf_state->ls_head; prev = NULL; overlap = NULL; needtolink = 1; @@ -411,8 +458,12 @@ lf_setlock(struct lockf *lock) lf_link(lock, overlap); else if (prev) /* last lock in list */ lf_link(prev, lock); - else /* first lock in list */ - *head = lock; + else { /* first lock in list */ +#ifdef LOCKF_DIAGNOSTIC + KASSERT(lock->lf_state->ls_head == NULL); +#endif + lock->lf_state->ls_head = lock; + } } break; case 1: /* overlap == lock */ @@ -422,7 +473,7 @@ lf_setlock(struct lockf *lock) */ if (lock->lf_type == F_RDLCK && overlap->lf_type == F_WRLCK) - lf_wakelock(overlap); + lf_wakelock(overlap, 0); overlap->lf_type = lock->lf_type; lf_free(lock); lock = overlap; /* for debug output below */ @@ -443,7 +494,7 @@ lf_setlock(struct lockf *lock) overlap->lf_start = lock->lf_end + 1; } else lf_split(overlap, lock); - lf_wakelock(overlap); + lf_wakelock(overlap, 0); break; case 3: /* lock contains overlap */ /* @@ -452,7 +503,7 @@ lf_setlock(struct lockf *lock) */ if (lock->lf_type == F_RDLCK && overlap->lf_type == F_WRLCK) { - lf_wakelock(overlap); + lf_wakelock(overlap, 0); } else { while ((ltmp = TAILQ_FIRST(&overlap->lf_blkhd))) { @@ -480,7 +531,7 @@ lf_setlock(struct lockf *lock) lf_unlink(lock); lf_link(overlap, lock); overlap->lf_end = lock->lf_start - 1; - lf_wakelock(overlap); + lf_wakelock(overlap, 0); needtolink = 0; continue; case 5: /* overlap ends after lock */ @@ -490,7 +541,7 @@ lf_setlock(struct lockf *lock) if (needtolink) lf_link(lock, overlap); overlap->lf_start = lock->lf_end + 1; - lf_wakelock(overlap); + lf_wakelock(overlap, 0); break; } break; @@ -508,8 +559,7 @@ lf_setlock(struct lockf *lock) int lf_clearlock(struct lockf *lock) { - struct lockf **head = lock->lf_head; - struct lockf *lf = *head; + struct lockf *lf 
= lock->lf_state->ls_head; struct lockf *overlap, *prev; int ovcase; @@ -517,7 +567,7 @@ lf_clearlock(struct lockf *lock) return (0); LFPRINT(("lf_clearlock", lock), DEBUG_CLEARLOCK); while ((ovcase = lf_findoverlap(lf, lock, SELF, &prev, &overlap))) { - lf_wakelock(overlap); + lf_wakelock(overlap, 0); switch (ovcase) { case 1: /* overlap == lock */ @@ -582,7 +632,7 @@ lf_getblock(struct lockf *lock) { struct lockf *prev, *overlap, *lf; - lf = *lock->lf_head; + lf = lock->lf_state->ls_head; while (lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != 0) { /* * We've found an overlap, see if it blocks us @@ -682,6 +732,43 @@ lf_findoverlap(struct lockf *lf, struct lockf *lock, int type, } /* + * Purge all locks associated with the given lock state. + */ +void +lf_purgelocks(struct lockf_state *ls) +{ + struct lockf *lock; + + if (ls == NULL) + return; + + ls_ref(ls); + + /* Interrupt blocked locks and wait for all of them to finish. */ + for (lock = ls->ls_head; lock != NULL; lock = lock->lf_next) { + LFPRINT(("lf_purgelocks: wakeup", lock), DEBUG_SETLOCK); + lf_wakelock(lock, F_INTR); + } + while (ls->ls_pending > 0) { + tsleep(ls, PLOCK, "lockfp", 0); + } + + /* + * Any remaining locks cannot block other locks at this point and can + * safely be removed. + */ + while ((lock = ls->ls_head) != NULL) { + lf_free(lock); + } + + /* This is the last expected thread to hold a lock state reference. */ +#ifdef LOCKF_DIAGNOSTIC + KASSERT(ls->ls_refs == 1); +#endif + ls_rele(ls); +} + +/* * Split a lock and a contained region into * two or three locks as necessary. 
*/ @@ -712,6 +799,7 @@ lf_split(struct lockf *lock1, struct lockf *lock2) */ splitlock = lf_alloc(lock1->lf_uid, 0); memcpy(splitlock, lock1, sizeof(*splitlock)); + ls_ref(splitlock->lf_state); splitlock->lf_prev = NULL; splitlock->lf_next = NULL; splitlock->lf_start = lock2->lf_end + 1; @@ -727,13 +815,14 @@ lf_split(struct lockf *lock1, struct lockf *lock2) * Wakeup a blocklist */ void -lf_wakelock(struct lockf *lock) +lf_wakelock(struct lockf *lock, int flags) { struct lockf *wakelock; while ((wakelock = TAILQ_FIRST(&lock->lf_blkhd))) { TAILQ_REMOVE(&lock->lf_blkhd, wakelock, lf_block); wakelock->lf_next = NULL; + wakelock->lf_flags |= flags; wakeup_one(wakelock); } } @@ -764,7 +853,8 @@ lf_print(char *tag, struct lockf *lock) lock->lf_type == F_WRLCK ? "exclusive" : lock->lf_type == F_UNLCK ? "unlock" : "unknown", lock->lf_start, lock->lf_end); - printf(", prev %p, next %p", lock->lf_prev, lock->lf_next); + printf(", prev %p, next %p, state %p", + lock->lf_prev, lock->lf_next, lock->lf_state); block = TAILQ_FIRST(&lock->lf_blkhd); if (block) printf(", block"); @@ -782,7 +872,7 @@ lf_printlist(char *tag, struct lockf *lock) #endif /* LOCKF_DIAGNOSTIC */ printf("%s: Lock list:\n", tag); - for (lf = *lock->lf_head; lf; lf = lf->lf_next) { + for (lf = lock->lf_state->ls_head; lf; lf = lf->lf_next) { if (lock == lf) printf(" * "); else diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 4e888b98f27..a15481c73f9 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vfs_subr.c,v 1.284 2018/12/23 10:46:51 natano Exp $ */ +/* $OpenBSD: vfs_subr.c,v 1.285 2019/01/21 18:09:21 anton Exp $ */ /* $NetBSD: vfs_subr.c,v 1.53 1996/04/22 01:39:13 christos Exp $ */ /* @@ -52,6 +52,7 @@ #include <sys/conf.h> #include <sys/vnode.h> #include <sys/lock.h> +#include <sys/lockf.h> #include <sys/stat.h> #include <sys/acct.h> #include <sys/namei.h> @@ -1150,6 +1151,7 @@ vgonel(struct vnode *vp, struct proc *p) vx->v_flag &= ~VALIASED; 
vp->v_flag &= ~VALIASED; } + lf_purgelocks(vp->v_speclockf); free(vp->v_specinfo, M_VNODE, sizeof(struct specinfo)); vp->v_specinfo = NULL; } diff --git a/sys/msdosfs/denode.h b/sys/msdosfs/denode.h index 5fe1d8862cd..3a3f629df52 100644 --- a/sys/msdosfs/denode.h +++ b/sys/msdosfs/denode.h @@ -1,4 +1,4 @@ -/* $OpenBSD: denode.h,v 1.33 2018/05/07 14:43:01 mpi Exp $ */ +/* $OpenBSD: denode.h,v 1.34 2019/01/21 18:09:21 anton Exp $ */ /* $NetBSD: denode.h,v 1.24 1997/10/17 11:23:39 ws Exp $ */ /*- @@ -148,7 +148,7 @@ struct denode { int de_fndcnt; /* number of slots before de_fndoffset */ long de_refcnt; /* reference count */ struct msdosfsmount *de_pmp; /* addr of our mount struct */ - struct lockf *de_lockf; /* byte level lock list */ + struct lockf_state *de_lockf; /* byte level lock list */ struct rrwlock de_lock; /* denode lock */ u_char de_Name[11]; /* name, from DOS directory entry */ u_char de_Attributes; /* attributes, from directory entry */ diff --git a/sys/nfs/nfsnode.h b/sys/nfs/nfsnode.h index 6a074a686db..1348060595f 100644 --- a/sys/nfs/nfsnode.h +++ b/sys/nfs/nfsnode.h @@ -1,4 +1,4 @@ -/* $OpenBSD: nfsnode.h,v 1.40 2018/05/05 11:54:11 mpi Exp $ */ +/* $OpenBSD: nfsnode.h,v 1.41 2019/01/21 18:09:21 anton Exp $ */ /* $NetBSD: nfsnode.h,v 1.16 1996/02/18 11:54:04 fvdl Exp $ */ /* @@ -78,7 +78,7 @@ struct nfsnode { time_t n_ctime; /* Prev create time. 
*/ nfsfh_t *n_fhp; /* NFS File Handle */ struct vnode *n_vnode; /* associated vnode */ - struct lockf *n_lockf; /* Locking record of file */ + struct lockf_state *n_lockf; /* Locking record of file */ struct rrwlock n_lock; /* NFSnode lock */ int n_error; /* Save write error value */ union { diff --git a/sys/sys/fcntl.h b/sys/sys/fcntl.h index b08dcc72ff8..e964ea49dde 100644 --- a/sys/sys/fcntl.h +++ b/sys/sys/fcntl.h @@ -1,4 +1,4 @@ -/* $OpenBSD: fcntl.h,v 1.21 2015/05/17 01:22:01 deraadt Exp $ */ +/* $OpenBSD: fcntl.h,v 1.22 2019/01/21 18:09:21 anton Exp $ */ /* $NetBSD: fcntl.h,v 1.8 1995/03/26 20:24:12 jtc Exp $ */ /*- @@ -170,6 +170,7 @@ #define F_WAIT 0x010 /* Wait until lock is granted */ #define F_FLOCK 0x020 /* Use flock(2) semantics for lock */ #define F_POSIX 0x040 /* Use POSIX semantics for lock */ +#define F_INTR 0x080 /* Lock operation interrupted */ #endif /* diff --git a/sys/sys/lockf.h b/sys/sys/lockf.h index a822bcd32aa..90212a32e1e 100644 --- a/sys/sys/lockf.h +++ b/sys/sys/lockf.h @@ -1,4 +1,4 @@ -/* $OpenBSD: lockf.h,v 1.12 2018/11/02 07:15:03 anton Exp $ */ +/* $OpenBSD: lockf.h,v 1.13 2019/01/21 18:09:21 anton Exp $ */ /* $NetBSD: lockf.h,v 1.5 1994/06/29 06:44:33 cgd Exp $ */ /* @@ -49,7 +49,7 @@ struct lockf { off_t lf_start; /* The byte # of the start of the lock */ off_t lf_end; /* The byte # of the end of the lock (-1=EOF)*/ caddr_t lf_id; /* The id of the resource holding the lock */ - struct lockf **lf_head; /* Back pointer to the head of lockf list */ + struct lockf_state *lf_state; /* State associated with the lock */ struct lockf *lf_prev; /* A pointer to the previous lock on this inode */ struct lockf *lf_next; /* A pointer to the next lock on this inode */ struct locklist lf_blkhd; /* The list of blocked locks */ @@ -58,13 +58,20 @@ struct lockf { pid_t lf_pid; /* POSIX - owner pid */ }; +struct lockf_state { + struct lockf_state **ls_owner; /* owner */ + struct lockf *ls_head; /* linked list of locks */ + int ls_refs; /* 
reference counter */ + int ls_pending; /* pending lock operations */ +}; + /* Maximum length of sleep chains to traverse to try and detect deadlock. */ #define MAXDEPTH 50 #ifdef _KERNEL __BEGIN_DECLS void lf_init(void); -int lf_advlock(struct lockf **, +int lf_advlock(struct lockf_state **, off_t, caddr_t, int, struct flock *, int); int lf_clearlock(struct lockf *); int lf_findoverlap(struct lockf *, @@ -73,8 +80,9 @@ struct lockf * lf_getblock(struct lockf *); int lf_getlock(struct lockf *, struct flock *); int lf_setlock(struct lockf *); +void lf_purgelocks(struct lockf_state *); void lf_split(struct lockf *, struct lockf *); -void lf_wakelock(struct lockf *); +void lf_wakelock(struct lockf *, int); void lf_link(struct lockf *, struct lockf *); void lf_unlink(struct lockf *); __END_DECLS diff --git a/sys/sys/specdev.h b/sys/sys/specdev.h index c85e95d14db..870aec0d324 100644 --- a/sys/sys/specdev.h +++ b/sys/sys/specdev.h @@ -1,4 +1,4 @@ -/* $OpenBSD: specdev.h,v 1.37 2016/04/05 19:26:15 natano Exp $ */ +/* $OpenBSD: specdev.h,v 1.38 2019/01/21 18:09:21 anton Exp $ */ /* $NetBSD: specdev.h,v 1.12 1996/02/13 13:13:01 mycroft Exp $ */ /* @@ -42,7 +42,7 @@ struct specinfo { struct vnode *si_specnext; struct mount *si_mountpoint; dev_t si_rdev; - struct lockf *si_lockf; + struct lockf_state *si_lockf; daddr_t si_lastr; union { struct vnode *ci_parent; /* pointer back to parent device */ diff --git a/sys/tmpfs/tmpfs.h b/sys/tmpfs/tmpfs.h index 8646ca36446..0c5b1583327 100644 --- a/sys/tmpfs/tmpfs.h +++ b/sys/tmpfs/tmpfs.h @@ -1,4 +1,4 @@ -/* $OpenBSD: tmpfs.h,v 1.8 2016/06/19 11:54:33 natano Exp $ */ +/* $OpenBSD: tmpfs.h,v 1.9 2019/01/21 18:09:21 anton Exp $ */ /* $NetBSD: tmpfs.h,v 1.45 2011/09/27 01:10:43 christos Exp $ */ /* @@ -122,7 +122,7 @@ typedef struct tmpfs_node { struct timespec tn_birthtime; /* Head of byte-level lock list (used by tmpfs_advlock). */ - struct lockf * tn_lockf; + struct lockf_state * tn_lockf; union { /* Type case: VBLK or VCHR. 
*/ diff --git a/sys/ufs/ufs/inode.h b/sys/ufs/ufs/inode.h index 3f2c7081d3c..4675dc7e38d 100644 --- a/sys/ufs/ufs/inode.h +++ b/sys/ufs/ufs/inode.h @@ -1,4 +1,4 @@ -/* $OpenBSD: inode.h,v 1.50 2016/06/19 11:54:34 natano Exp $ */ +/* $OpenBSD: inode.h,v 1.51 2019/01/21 18:09:21 anton Exp $ */ /* $NetBSD: inode.h,v 1.8 1995/06/15 23:22:50 cgd Exp $ */ /* @@ -85,7 +85,7 @@ struct inode { struct cluster_info i_ci; struct dquot *i_dquot[MAXQUOTAS]; /* Dquot structures. */ u_quad_t i_modrev; /* Revision level for NFS lease. */ - struct lockf *i_lockf;/* Head of byte-level lock list. */ + struct lockf_state *i_lockf; /* Byte-level lock state. */ struct rrwlock i_lock;/* Inode lock */ /* |