author    | Constantine Sapuntzakis <csapuntz@cvs.openbsd.org> | 2001-02-21 23:24:33 +0000
committer | Constantine Sapuntzakis <csapuntz@cvs.openbsd.org> | 2001-02-21 23:24:33 +0000
commit    | 33e6fbe33f4ec84f10016e81c87c7be89171378b (patch)
tree      | 122cb8b58569496544bc77d618dec5e995a23b94 /sys
parent    | d0a302227eeedfb62540337fcbc0741756591d7c (diff)
Latest soft updates from FreeBSD/Kirk McKusick
Snapshot-related code has been commented out.
Diffstat (limited to 'sys')
-rw-r--r-- | sys/kern/kern_malloc.c | 9
-rw-r--r-- | sys/kern/vfs_bio.c | 16
-rw-r--r-- | sys/kern/vfs_cluster.c | 6
-rw-r--r-- | sys/kern/vfs_subr.c | 16
-rw-r--r-- | sys/kern/vfs_sync.c | 35
-rw-r--r-- | sys/kern/vfs_syscalls.c | 9
-rw-r--r-- | sys/sys/buf.h | 48
-rw-r--r-- | sys/sys/malloc.h | 4
-rw-r--r-- | sys/sys/vnode.h | 7
-rw-r--r-- | sys/ufs/ffs/ffs_alloc.c | 6
-rw-r--r-- | sys/ufs/ffs/ffs_extern.h | 22
-rw-r--r-- | sys/ufs/ffs/ffs_inode.c | 8
-rw-r--r-- | sys/ufs/ffs/ffs_softdep.c | 1532
-rw-r--r-- | sys/ufs/ffs/ffs_softdep_stub.c | 57
-rw-r--r-- | sys/ufs/ffs/ffs_vfsops.c | 11
-rw-r--r-- | sys/ufs/ffs/softdep.h | 40
-rw-r--r-- | sys/ufs/ufs/inode.h | 3
-rw-r--r-- | sys/ufs/ufs/ufs_extern.h | 5
-rw-r--r-- | sys/ufs/ufs/ufs_lookup.c | 48
-rw-r--r-- | sys/ufs/ufs/ufs_vnops.c | 106 |
20 files changed, 1266 insertions, 722 deletions
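
A pattern that recurs throughout the kernel hunks below: the open-coded tests of the bioops function pointers in vfs_bio.c and vfs_cluster.c are replaced by the buf_start(), buf_complete() and buf_deallocate() inline wrappers added to sys/sys/buf.h, so callers only check whether the buffer has dependencies. The following standalone C sketch condenses that wrapper pattern purely for illustration; struct buf is stubbed, and the hook assignment in main() merely stands in for the bioops initializer in ffs_softdep.c.

```c
/*
 * Standalone sketch of the bio_ops wrappers added to sys/sys/buf.h in
 * this commit.  struct buf is stubbed so the example compiles on its
 * own; in the kernel the structure and the bioops table are the real
 * definitions shown in the diff.
 */
#include <stdio.h>

struct buf;

struct bio_ops {
	void (*io_start)(struct buf *);
	void (*io_complete)(struct buf *);
	void (*io_deallocate)(struct buf *);
	void (*io_movedeps)(struct buf *, struct buf *);
	int  (*io_countdeps)(struct buf *, int);
} bioops;				/* all NULL until softdep registers */

struct buf { int b_flags; };		/* minimal stand-in */

/* The wrappers hide the "is a hook registered?" test from callers. */
static __inline void
buf_deallocate(struct buf *bp)
{
	if (bioops.io_deallocate)
		(*bioops.io_deallocate)(bp);
}

static __inline int
buf_countdeps(struct buf *bp, int i)
{
	if (bioops.io_countdeps)
		return ((*bioops.io_countdeps)(bp, i));
	return (0);
}

/* Example hook; ffs_softdep.c installs the real ones via its initializer. */
static void
example_deallocate(struct buf *bp)
{
	printf("dropping dependencies for buffer %p\n", (void *)bp);
}

int
main(void)
{
	struct buf b = { 0 };

	buf_deallocate(&b);			/* no-op: no hooks registered */
	bioops.io_deallocate = example_deallocate;
	buf_deallocate(&b);			/* now dispatches into the hook */
	printf("deps: %d\n", buf_countdeps(&b, 0));
	return (0);
}
```

With no hooks registered the wrappers collapse to cheap no-ops, which is the point of the indirection: the generic buffer cache never needs to know whether soft updates is compiled in.
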
diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c index d52ea29ef5e..6c2c7f3c046 100644 --- a/sys/kern/kern_malloc.c +++ b/sys/kern/kern_malloc.c @@ -1,4 +1,4 @@ -/* $OpenBSD: kern_malloc.c,v 1.23 2001/02/20 23:35:35 csapuntz Exp $ */ +/* $OpenBSD: kern_malloc.c,v 1.24 2001/02/21 23:24:29 csapuntz Exp $ */ /* $NetBSD: kern_malloc.c,v 1.15.4.2 1996/06/13 17:10:56 cgd Exp $ */ /* @@ -135,11 +135,8 @@ malloc(size, type, flags) #endif #ifdef MALLOC_DEBUG - if (debug_malloc(size, type, flags, (void **)&va)) { - if ((flags & M_ZERO) && va != NULL) - bzero(va, size); + if (debug_malloc(size, type, flags, (void **)&va)) return ((void *) va); - } #endif indx = BUCKETINDX(size); @@ -312,8 +309,6 @@ out: out: #endif splx(s); - if ((flags & M_ZERO) && va != NULL) - bzero(va, size); return ((void *) va); } diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 3021d4bbd0d..eae71abafb5 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vfs_bio.c,v 1.28 2001/02/13 19:51:49 art Exp $ */ +/* $OpenBSD: vfs_bio.c,v 1.29 2001/02/21 23:24:30 csapuntz Exp $ */ /* $NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $ */ /*- @@ -475,9 +475,9 @@ brelse(bp) * If it's invalid or empty, dissociate it from its vnode * and put on the head of the appropriate queue. */ - if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) { - (*bioops.io_deallocate)(bp); - } + if (LIST_FIRST(&bp->b_dep) != NULL) + buf_deallocate(bp); + CLR(bp->b_flags, B_DELWRI); if (bp->b_vp) { reassignbuf(bp, bp->b_vp); @@ -787,8 +787,8 @@ start: splx(s); - if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) - (*bioops.io_deallocate)(bp); + if (LIST_FIRST(&bp->b_dep) != NULL) + buf_deallocate(bp); /* clear out various other fields */ bp->b_flags = B_BUSY; @@ -866,8 +866,8 @@ biodone(bp) panic("biodone already"); SET(bp->b_flags, B_DONE); /* note that it's done */ - if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete) - (*bioops.io_complete)(bp); + if (LIST_FIRST(&bp->b_dep) != NULL) + buf_complete(bp); if (!ISSET(bp->b_flags, B_READ)) /* wake up reader */ vwakeup(bp); diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index 0c433c72b83..1839e585f0f 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vfs_cluster.c,v 1.17 2000/06/23 02:14:38 mickey Exp $ */ +/* $OpenBSD: vfs_cluster.c,v 1.18 2001/02/21 23:24:30 csapuntz Exp $ */ /* $NetBSD: vfs_cluster.c,v 1.12 1996/04/22 01:39:05 christos Exp $ */ /*- @@ -703,8 +703,8 @@ redo: tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); tbp->b_flags |= (B_ASYNC | B_AGE); - if (LIST_FIRST(&tbp->b_dep) != NULL && bioops.io_start) - (*bioops.io_start)(tbp); + if (LIST_FIRST(&tbp->b_dep) != NULL) + buf_start(tbp); pagemove(tbp->b_data, cp, size); bp->b_bcount += size; diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 0cb3e61cc4b..bee6b56c1ae 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vfs_subr.c,v 1.48 2001/02/08 00:32:11 mickey Exp $ */ +/* $OpenBSD: vfs_subr.c,v 1.49 2001/02/21 23:24:30 csapuntz Exp $ */ /* $NetBSD: vfs_subr.c,v 1.53 1996/04/22 01:39:13 christos Exp $ */ /* @@ -2209,3 +2209,17 @@ vfs_unregister(vfs) return 0; } + +/* + * Check if vnode represents a disk device + */ +int +vn_isdisk(vp, errp) + struct vnode *vp; + int *errp; +{ + if (vp->v_type != VBLK && vp->v_type != VCHR) + return (0); + + return (1); +} diff --git a/sys/kern/vfs_sync.c b/sys/kern/vfs_sync.c index 128866b8f69..d5c9fddf418 100644 --- 
a/sys/kern/vfs_sync.c +++ b/sys/kern/vfs_sync.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vfs_sync.c,v 1.12 2000/03/23 15:57:33 art Exp $ */ +/* $OpenBSD: vfs_sync.c,v 1.13 2001/02/21 23:24:30 csapuntz Exp $ */ /* * Portions of this code are: @@ -55,6 +55,10 @@ #include <sys/kernel.h> +#ifdef FFS_SOFTUPDATES +int softdep_process_worklist __P((struct mount *)); +#endif + /* * The workitem queue. */ @@ -67,7 +71,7 @@ int rushjob = 0; /* number of slots to run ASAP */ int stat_rush_requests = 0; /* number of rush requests */ static int syncer_delayno = 0; -static long syncer_last; +static long syncer_mask; LIST_HEAD(synclist, vnode); static struct synclist *syncer_workitem_pending; @@ -105,16 +109,9 @@ void vn_initialize_syncerd() { - int i; - - syncer_last = SYNCER_MAXDELAY + 2; - - syncer_workitem_pending = - malloc(syncer_last * sizeof(struct synclist), - M_VNODE, M_WAITOK); - - for (i = 0; i < syncer_last; i++) - LIST_INIT(&syncer_workitem_pending[i]); + syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, M_WAITOK, + &syncer_mask); + syncer_maxdelay = syncer_mask + 1; } /* @@ -132,9 +129,10 @@ vn_syncer_add_to_worklist(vp, delay) if (vp->v_flag & VONSYNCLIST) LIST_REMOVE(vp, v_synclist); - if (delay > syncer_maxdelay) - delay = syncer_maxdelay; - slot = (syncer_delayno + delay) % syncer_last; + if (delay > syncer_maxdelay - 2) + delay = syncer_maxdelay - 2; + slot = (syncer_delayno + delay) & syncer_mask; + LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist); vp->v_flag |= VONSYNCLIST; splx(s); @@ -164,7 +162,7 @@ sched_sync(p) s = splbio(); slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; - if (syncer_delayno >= syncer_last) + if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; splx(s); while ((vp = LIST_FIRST(slp)) != NULL) { @@ -182,11 +180,12 @@ sched_sync(p) } } +#ifdef FFS_SOFTUPDATES /* * Do soft update processing. */ - if (bioops.io_sync) - (*bioops.io_sync)(NULL); + softdep_process_worklist(NULL); +#endif /* * The variable rushjob allows the kernel to speed up the diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index ec0f6bf9597..f74993737ae 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vfs_syscalls.c,v 1.67 2001/02/20 01:50:09 assar Exp $ */ +/* $OpenBSD: vfs_syscalls.c,v 1.68 2001/02/21 23:24:30 csapuntz Exp $ */ /* $NetBSD: vfs_syscalls.c,v 1.71 1996/04/23 10:29:02 mycroft Exp $ */ /* @@ -2280,9 +2280,10 @@ sys_fsync(p, v, retval) vp = (struct vnode *)fp->f_data; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, p); - if (error == 0 && bioops.io_fsync != NULL && - vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)) - error = (*bioops.io_fsync)(vp); +#ifdef FFS_SOFTUPDATES + if (error == 0 && vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)) + error = softdep_fsync(vp); +#endif VOP_UNLOCK(vp, 0, p); return (error); diff --git a/sys/sys/buf.h b/sys/sys/buf.h index 24c66801ab1..0ea3baee7b2 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -1,4 +1,4 @@ -/* $OpenBSD: buf.h,v 1.15 1999/02/26 02:15:41 art Exp $ */ +/* $OpenBSD: buf.h,v 1.16 2001/02/21 23:24:30 csapuntz Exp $ */ /* $NetBSD: buf.h,v 1.25 1997/04/09 21:12:17 mycroft Exp $ */ /* @@ -62,13 +62,12 @@ LIST_HEAD(workhead, worklist); * to use these hooks, a pointer to a set of bio_ops could be added * to each buffer. 
*/ -struct mount; extern struct bio_ops { void (*io_start) __P((struct buf *)); void (*io_complete) __P((struct buf *)); - void (*io_deallocate) __P((struct buf *)); - int (*io_fsync) __P((struct vnode *)); - int (*io_sync) __P((struct mount *)); + void (*io_deallocate) __P((struct buf *)); + void (*io_movedeps) __P((struct buf *, struct buf *)); + int (*io_countdeps) __P((struct buf *, int)); } bioops; @@ -174,6 +173,7 @@ struct cluster_save { (bp)->b_resid = 0; \ } + /* Flags to low-level allocation routines. */ #define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */ #define B_SYNC 0x02 /* Do all allocations synchronously. */ @@ -221,6 +221,44 @@ int physio __P((void (*strategy)(struct buf *), struct buf *bp, dev_t dev, void brelvp __P((struct buf *)); void reassignbuf __P((struct buf *, struct vnode *)); void bgetvp __P((struct vnode *, struct buf *)); + +static __inline void +buf_start(struct buf *bp) +{ + if (bioops.io_start) + (*bioops.io_start)(bp); +} + +static __inline void +buf_complete(struct buf *bp) +{ + if (bioops.io_complete) + (*bioops.io_complete)(bp); +} + +static __inline void +buf_deallocate(struct buf *bp) +{ + if (bioops.io_deallocate) + (*bioops.io_deallocate)(bp); +} + +static __inline void +buf_movedeps(struct buf *bp, struct buf *bp2) +{ + if (bioops.io_movedeps) + (*bioops.io_movedeps)(bp, bp2); +} + +static __inline int +buf_countdeps(struct buf *bp, int i) +{ + if (bioops.io_countdeps) + return ((*bioops.io_countdeps)(bp, i)); + else + return (0); +} + __END_DECLS #endif #endif /* !_SYS_BUF_H_ */ diff --git a/sys/sys/malloc.h b/sys/sys/malloc.h index eb63e284112..74a23c2fb15 100644 --- a/sys/sys/malloc.h +++ b/sys/sys/malloc.h @@ -1,4 +1,4 @@ -/* $OpenBSD: malloc.h,v 1.36 2001/02/21 08:03:52 csapuntz Exp $ */ +/* $OpenBSD: malloc.h,v 1.37 2001/02/21 23:24:30 csapuntz Exp $ */ /* $NetBSD: malloc.h,v 1.39 1998/07/12 19:52:01 augustss Exp $ */ /* @@ -54,7 +54,6 @@ */ #define M_WAITOK 0x0000 #define M_NOWAIT 0x0001 -#define M_ZERO 0x0008 /* * Types of memory to be allocated @@ -407,7 +406,6 @@ struct kmembuckets { } else { \ (space) = (cast)kbp->kb_next; \ kbp->kb_next = *(caddr_t *)(space); \ - if (flags & M_ZERO) bzero((space),(size)); \ } \ splx(s); \ } while (0) diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 9d5685c67e4..8251fb22e7e 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -1,4 +1,4 @@ -/* $OpenBSD: vnode.h,v 1.26 2000/11/21 21:49:56 provos Exp $ */ +/* $OpenBSD: vnode.h,v 1.27 2001/02/21 23:24:30 csapuntz Exp $ */ /* $NetBSD: vnode.h,v 1.38 1996/02/29 20:59:05 cgd Exp $ */ /* @@ -507,4 +507,9 @@ void vput __P((struct vnode *vp)); void vrele __P((struct vnode *vp)); int vaccess __P((mode_t file_mode, uid_t uid, gid_t gid, mode_t acc_mode, struct ucred *cred)); + +int vn_isdisk __P((struct vnode *vp, int *errp)); + +int softdep_fsync __P((struct vnode *vp)); + #endif /* _KERNEL */ diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c index 7562ea203b9..9ddaf9f85f6 100644 --- a/sys/ufs/ffs/ffs_alloc.c +++ b/sys/ufs/ffs/ffs_alloc.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ffs_alloc.c,v 1.18 2000/01/14 19:23:34 art Exp $ */ +/* $OpenBSD: ffs_alloc.c,v 1.19 2001/02/21 23:24:30 csapuntz Exp $ */ /* $NetBSD: ffs_alloc.c,v 1.11 1996/05/11 18:27:09 mycroft Exp $ */ /* @@ -317,7 +317,7 @@ nospace: * logical blocks to be made contiguous is given. 
The allocator attempts * to find a range of sequential blocks starting as close as possible to * an fs_rotdelay offset from the end of the allocation for the logical - * block immediately preceeding the current range. If successful, the + * block immediately preceding the current range. If successful, the * physical block numbers in the buffer pointers and in the inode are * changed to reflect the new allocation. If unsuccessful, the allocation * is left unchanged. The success in doing the reallocation is returned. @@ -1414,7 +1414,7 @@ ffs_vfree(v) if (DOINGSOFTDEP(ap->a_pvp)) { - softdep_freefile(ap); + softdep_freefile(ap->a_pvp, ap->a_ino, ap->a_mode); return (0); } diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h index 44b77883d07..93dda25df9c 100644 --- a/sys/ufs/ffs/ffs_extern.h +++ b/sys/ufs/ffs/ffs_extern.h @@ -1,4 +1,4 @@ -/* $OpenBSD: ffs_extern.h,v 1.10 2001/02/20 01:50:12 assar Exp $ */ +/* $OpenBSD: ffs_extern.h,v 1.11 2001/02/21 23:24:31 csapuntz Exp $ */ /* $NetBSD: ffs_extern.h,v 1.4 1996/02/09 22:22:22 christos Exp $ */ /*- @@ -140,25 +140,23 @@ void softdep_initialize __P((void)); int softdep_process_worklist __P((struct mount *)); int softdep_mount __P((struct vnode *, struct mount *, struct fs *, struct ucred *)); +int softdep_flushworklist __P((struct mount *, int *, struct proc *)); int softdep_flushfiles __P((struct mount *, int, struct proc *)); void softdep_update_inodeblock __P((struct inode *, struct buf *, int)); void softdep_load_inodeblock __P((struct inode *)); -int softdep_fsync __P((struct vnode *)); -void softdep_freefile __P((struct vop_vfree_args *)); +void softdep_freefile __P((struct vnode *, ino_t, int)); void softdep_setup_freeblocks __P((struct inode *, off_t)); -void softdep_deallocate_dependencies __P((struct buf *)); void softdep_setup_inomapdep __P((struct buf *, struct inode *, ino_t)); -void softdep_setup_blkmapdep __P((struct buf *, struct fs *, daddr_t)); -void softdep_setup_allocdirect __P((struct inode *, ufs_lbn_t, daddr_t, - daddr_t, long, long, struct buf *)); +void softdep_setup_blkmapdep __P((struct buf *, struct fs *, ufs_daddr_t)); +void softdep_setup_allocdirect __P((struct inode *, ufs_lbn_t, ufs_daddr_t, + ufs_daddr_t, long, long, struct buf *)); void softdep_setup_allocindir_meta __P((struct buf *, struct inode *, - struct buf *, int, daddr_t)); + struct buf *, int, ufs_daddr_t)); void softdep_setup_allocindir_page __P((struct inode *, ufs_lbn_t, - struct buf *, int, daddr_t, daddr_t, struct buf *)); -void softdep_disk_io_initiation __P((struct buf *)); -void softdep_disk_write_complete __P((struct buf *)); -int softdep_sync_metadata __P((struct vop_fsync_args *)); + struct buf *, int, ufs_daddr_t, ufs_daddr_t, struct buf *)); void softdep_fsync_mountdev __P((struct vnode *)); +int softdep_sync_metadata __P((struct vop_fsync_args *)); +int softdep_fsync __P((struct vnode *vp)); __END_DECLS extern int (**ffs_vnodeop_p) __P((void *)); diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c index b9f1ad90b8d..6607642eae3 100644 --- a/sys/ufs/ffs/ffs_inode.c +++ b/sys/ufs/ffs/ffs_inode.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ffs_inode.c,v 1.16 2000/06/23 02:14:39 mickey Exp $ */ +/* $OpenBSD: ffs_inode.c,v 1.17 2001/02/21 23:24:31 csapuntz Exp $ */ /* $NetBSD: ffs_inode.c,v 1.10 1996/05/11 18:27:19 mycroft Exp $ */ /* @@ -139,7 +139,7 @@ ffs_update(v) *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = ip->i_din.ffs_din; - if (ap->a_waitfor && (ap->a_vp->v_mount->mnt_flag & MNT_ASYNC) == 0) { + if 
(ap->a_waitfor && !DOINGASYNC(ap->a_vp)) { return (bwrite(bp)); } else { bdwrite(bp); @@ -210,7 +210,7 @@ ffs_truncate(v) #endif ovp->v_lasta = ovp->v_clen = ovp->v_cstart = ovp->v_lastw = 0; if (DOINGSOFTDEP(ovp)) { - if (length > 0) { + if (length > 0 || softdep_slowdown(ovp)) { /* * If a file is only partially truncated, then * we have to clean up the data structures @@ -510,7 +510,7 @@ ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) bcopy((caddr_t)bap, (caddr_t)copy, (u_int)fs->fs_bsize); bzero((caddr_t)&bap[last + 1], (u_int)(NINDIR(fs) - (last + 1)) * sizeof (daddr_t)); - if ((vp->v_mount->mnt_flag & MNT_ASYNC) == 0) { + if (!DOINGASYNC(vp)) { error = bwrite(bp); if (error) allerror = error; diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c index 771dd4562ca..12d9f631618 100644 --- a/sys/ufs/ffs/ffs_softdep.c +++ b/sys/ufs/ffs/ffs_softdep.c @@ -1,21 +1,17 @@ -/* $OpenBSD: ffs_softdep.c,v 1.11 2001/02/10 11:08:39 fgsch Exp $ */ +/* $OpenBSD: ffs_softdep.c,v 1.12 2001/02/21 23:24:31 csapuntz Exp $ */ /* - * Copyright 1998 Marshall Kirk McKusick. All Rights Reserved. + * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved. * * The soft updates code is derived from the appendix of a University * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, * "Soft Updates: A Solution to the Metadata Update Problem in File * Systems", CSE-TR-254-95, August 1995). * - * The following are the copyrights and redistribution conditions that - * apply to this copy of the soft update software. For a license - * to use, redistribute or sell the soft update software under - * conditions other than those described here, please contact the - * author at one of the following addresses: + * Further information about soft updates can be obtained from: * - * Marshall Kirk McKusick mckusick@mckusick.com - * 1614 Oxford Street +1-510-843-9542 - * Berkeley, CA 94709-1608 + * Marshall Kirk McKusick http://www.mckusick.com/softdep/ + * 1614 Oxford Street mckusick@mckusick.com + * Berkeley, CA 94709-1608 +1-510-843-9542 * USA * * Redistribution and use in source and binary forms, with or without @@ -27,19 +23,6 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. None of the names of McKusick, Ganger, Patt, or the University of - * Michigan may be used to endorse or promote products derived from - * this software without specific prior written permission. - * 4. Redistributions in any form must be accompanied by information on - * how to obtain complete source code for any accompanying software - * that uses this software. This source code must either be included - * in the distribution or be available for no more than the cost of - * distribution plus a nominal fee, and must be freely redistributable - * under reasonable conditions. For an executable file, complete - * source code means the source code for all modules it contains. - * It does not mean source code for modules or files that typically - * accompany the operating system on which the executable file runs, - * e.g., standard library modules or system header files. * * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED @@ -53,7 +36,8 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)ffs_softdep.c 9.40 (McKusick) 6/15/99 + * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00 + * $FreeBSD: src/sys/ufs/ffs/ffs_softdep.c,v 1.84 2001/02/04 16:08:18 phk Exp $ */ /* @@ -64,6 +48,7 @@ #endif #ifndef DEBUG #define DEBUG +#define STATIC #endif #include <sys/param.h> @@ -85,10 +70,14 @@ #include <ufs/ffs/ffs_extern.h> #include <ufs/ufs/ufs_extern.h> + /* * These definitions need to be adapted to the system to which * this file is being ported. */ + +#define M_SOFTDEP_FLAGS (M_WAITOK) + /* * Mapping of dependency structure types to malloc types. */ @@ -122,62 +111,71 @@ extern char *memname[]; /* * Internal function prototypes. */ -static void softdep_error __P((char *, int)); -static void drain_output __P((struct vnode *, int)); -static int getdirtybuf __P((struct buf **, int)); -static void clear_remove __P((struct proc *)); -static void clear_inodedeps __P((struct proc *)); -static int flush_pagedep_deps __P((struct vnode *, struct mount *, +STATIC void softdep_error __P((char *, int)); +STATIC void drain_output __P((struct vnode *, int)); +STATIC int getdirtybuf __P((struct buf **, int)); +STATIC void clear_remove __P((struct proc *)); +STATIC void clear_inodedeps __P((struct proc *)); +STATIC int flush_pagedep_deps __P((struct vnode *, struct mount *, struct diraddhd *)); -static int flush_inodedep_deps __P((struct fs *, ino_t)); -static int handle_written_filepage __P((struct pagedep *, struct buf *)); -static void diradd_inode_written __P((struct diradd *, struct inodedep *)); -static int handle_written_inodeblock __P((struct inodedep *, struct buf *)); -static void handle_allocdirect_partdone __P((struct allocdirect *)); -static void handle_allocindir_partdone __P((struct allocindir *)); -static void initiate_write_filepage __P((struct pagedep *, struct buf *)); -static void handle_written_mkdir __P((struct mkdir *, int)); -static void initiate_write_inodeblock __P((struct inodedep *, struct buf *)); -static void handle_workitem_freefile __P((struct freefile *)); -static void handle_workitem_remove __P((struct dirrem *)); -static struct dirrem *newdirrem __P((struct buf *, struct inode *, - struct inode *, int)); -static void free_diradd __P((struct diradd *)); -static void free_allocindir __P((struct allocindir *, struct inodedep *)); -static int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t, +STATIC int flush_inodedep_deps __P((struct fs *, ino_t)); +STATIC int handle_written_filepage __P((struct pagedep *, struct buf *)); +STATIC void diradd_inode_written __P((struct diradd *, struct inodedep *)); +STATIC int handle_written_inodeblock __P((struct inodedep *, struct buf *)); +STATIC void handle_allocdirect_partdone __P((struct allocdirect *)); +STATIC void handle_allocindir_partdone __P((struct allocindir *)); +STATIC void initiate_write_filepage __P((struct pagedep *, struct buf *)); +STATIC void handle_written_mkdir __P((struct mkdir *, int)); +STATIC void initiate_write_inodeblock __P((struct inodedep *, struct buf *)); +STATIC void handle_workitem_freefile __P((struct freefile *)); +STATIC void handle_workitem_remove __P((struct dirrem *)); +STATIC struct dirrem *newdirrem __P((struct buf *, struct inode *, + struct inode *, int, struct dirrem **)); +STATIC void free_diradd __P((struct diradd *)); +STATIC void free_allocindir __P((struct allocindir *, struct inodedep *)); +STATIC int indir_trunc __P((struct inode *, ufs_daddr_t, int, ufs_lbn_t, long *)); -static void deallocate_dependencies __P((struct buf *, struct inodedep *)); -static void 
free_allocdirect __P((struct allocdirectlst *, +STATIC void deallocate_dependencies __P((struct buf *, struct inodedep *)); +STATIC void free_allocdirect __P((struct allocdirectlst *, struct allocdirect *, int)); -static int free_inodedep __P((struct inodedep *)); -static void handle_workitem_freeblocks __P((struct freeblks *)); -static void merge_inode_lists __P((struct inodedep *)); -static void setup_allocindir_phase2 __P((struct buf *, struct inode *, +STATIC int check_inode_unwritten __P((struct inodedep *)); +STATIC int free_inodedep __P((struct inodedep *)); +STATIC void handle_workitem_freeblocks __P((struct freeblks *)); +STATIC void merge_inode_lists __P((struct inodedep *)); +STATIC void setup_allocindir_phase2 __P((struct buf *, struct inode *, struct allocindir *)); -static struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t, +STATIC struct allocindir *newallocindir __P((struct inode *, int, ufs_daddr_t, ufs_daddr_t)); -static void handle_workitem_freefrag __P((struct freefrag *)); -static struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long)); -static void allocdirect_merge __P((struct allocdirectlst *, +STATIC void handle_workitem_freefrag __P((struct freefrag *)); +STATIC struct freefrag *newfreefrag __P((struct inode *, ufs_daddr_t, long)); +STATIC void allocdirect_merge __P((struct allocdirectlst *, struct allocdirect *, struct allocdirect *)); -static struct bmsafemap *bmsafemap_lookup __P((struct buf *)); -static int newblk_lookup __P((struct fs *, ufs_daddr_t, int, +STATIC struct bmsafemap *bmsafemap_lookup __P((struct buf *)); +STATIC int newblk_lookup __P((struct fs *, ufs_daddr_t, int, struct newblk **)); -static int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **)); -static int pagedep_lookup __P((struct inode *, ufs_lbn_t, int, +STATIC int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **)); +STATIC int pagedep_lookup __P((struct inode *, ufs_lbn_t, int, struct pagedep **)); -static int request_cleanup __P((int, int)); -static void add_to_worklist __P((struct worklist *)); +STATIC void pause_timer __P((void *)); +STATIC int request_cleanup __P((int, int)); +STATIC int process_worklist_item __P((struct mount *, int)); +STATIC void add_to_worklist __P((struct worklist *)); /* * Exported softdep operations. */ +void softdep_disk_io_initiation __P((struct buf *)); +void softdep_disk_write_complete __P((struct buf *)); +void softdep_deallocate_dependencies __P((struct buf *)); +void softdep_move_dependencies __P((struct buf *, struct buf *)); +int softdep_count_dependencies __P((struct buf *bp, int)); + struct bio_ops bioops = { softdep_disk_io_initiation, /* io_start */ softdep_disk_write_complete, /* io_complete */ softdep_deallocate_dependencies, /* io_deallocate */ - softdep_fsync, /* io_fsync */ - softdep_process_worklist, /* io_sync */ + softdep_move_dependencies, /* io_movedeps */ + softdep_count_dependencies, /* io_countdeps */ }; /* @@ -196,7 +194,7 @@ struct bio_ops bioops = { * the spl, there is nothing that really needs to be done. 
*/ #ifndef /* NOT */ DEBUG -static struct lockit { +STATIC struct lockit { int lkt_spl; } lk = { 0 }; #define ACQUIRE_LOCK(lk) (lk)->lkt_spl = splbio() @@ -205,72 +203,78 @@ static struct lockit { #define FREE_LOCK_INTERLOCKED(lk) #else /* DEBUG */ -static struct lockit { +STATIC struct lockit { int lkt_spl; pid_t lkt_held; + int lkt_line; } lk = { 0, -1 }; -static int lockcnt; +STATIC int lockcnt; -static void acquire_lock __P((struct lockit *)); -static void free_lock __P((struct lockit *)); -static void acquire_lock_interlocked __P((struct lockit *)); -static void free_lock_interlocked __P((struct lockit *)); +STATIC void acquire_lock __P((struct lockit *, int)); +STATIC void free_lock __P((struct lockit *, int)); +STATIC void acquire_lock_interlocked __P((struct lockit *, int)); +STATIC void free_lock_interlocked __P((struct lockit *, int)); -#define ACQUIRE_LOCK(lk) acquire_lock(lk) -#define FREE_LOCK(lk) free_lock(lk) -#define ACQUIRE_LOCK_INTERLOCKED(lk) acquire_lock_interlocked(lk) -#define FREE_LOCK_INTERLOCKED(lk) free_lock_interlocked(lk) +#define ACQUIRE_LOCK(lk) acquire_lock(lk, __LINE__) +#define FREE_LOCK(lk) free_lock(lk, __LINE__) +#define ACQUIRE_LOCK_INTERLOCKED(lk) acquire_lock_interlocked(lk, __LINE__) +#define FREE_LOCK_INTERLOCKED(lk) free_lock_interlocked(lk, __LINE__) -static void -acquire_lock(lk) +STATIC void +acquire_lock(lk, line) struct lockit *lk; + int line; { if (lk->lkt_held != -1) { if (lk->lkt_held == CURPROC->p_pid) - panic("softdep_lock: locking against myself"); + panic("softdep_lock: locking against myself, acquired at line %d", lk->lkt_line); else - panic("softdep_lock: lock held by %d", lk->lkt_held); + panic("softdep_lock: lock held by %d, acquired at line %d", lk->lkt_held, line); } lk->lkt_spl = splbio(); lk->lkt_held = CURPROC->p_pid; + lk->lkt_line = line; lockcnt++; } -static void -free_lock(lk) +STATIC void +free_lock(lk, line) struct lockit *lk; + int line; { if (lk->lkt_held == -1) - panic("softdep_unlock: lock not held"); + panic("softdep_unlock: lock not held at line %d", line); lk->lkt_held = -1; splx(lk->lkt_spl); } -static void -acquire_lock_interlocked(lk) +STATIC void +acquire_lock_interlocked(lk, line) struct lockit *lk; + int line; { if (lk->lkt_held != -1) { if (lk->lkt_held == CURPROC->p_pid) - panic("softdep_lock_interlocked: locking against self"); + panic("softdep_lock: locking against myself, acquired at line %d", lk->lkt_line); else - panic("softdep_lock_interlocked: lock held by %d", - lk->lkt_held); + panic("softdep_lock: lock held by %d, acquired at line %d", lk->lkt_held, lk->lkt_line); } lk->lkt_held = CURPROC->p_pid; + lk->lkt_line = line; lockcnt++; } -static void -free_lock_interlocked(lk) +STATIC void +free_lock_interlocked(lk, line) struct lockit *lk; + int line; { if (lk->lkt_held == -1) - panic("softdep_unlock_interlocked: lock not held"); + panic("softdep_unlock_interlocked: lock not held at line %d", line); lk->lkt_held = -1; } #endif /* DEBUG */ @@ -285,11 +289,11 @@ struct sema { int prio; int timo; }; -static void sema_init __P((struct sema *, char *, int, int)); -static int sema_get __P((struct sema *, struct lockit *)); -static void sema_release __P((struct sema *)); +STATIC void sema_init __P((struct sema *, char *, int, int)); +STATIC int sema_get __P((struct sema *, struct lockit *)); +STATIC void sema_release __P((struct sema *)); -static void +STATIC void sema_init(semap, name, prio, timo) struct sema *semap; char *name; @@ -303,7 +307,7 @@ sema_init(semap, name, prio, timo) semap->timo = timo; } 
-static int +STATIC int sema_get(semap, interlock) struct sema *semap; struct lockit *interlock; @@ -325,7 +329,7 @@ sema_get(semap, interlock) return (1); } -static void +STATIC void sema_release(semap) struct sema *semap; { @@ -355,15 +359,15 @@ sema_release(semap) #define WORKITEM_FREE(item, type) FREE(item, DtoM(type)) #else /* DEBUG */ -static void worklist_insert __P((struct workhead *, struct worklist *)); -static void worklist_remove __P((struct worklist *)); -static void workitem_free __P((struct worklist *, int)); +STATIC void worklist_insert __P((struct workhead *, struct worklist *)); +STATIC void worklist_remove __P((struct worklist *)); +STATIC void workitem_free __P((struct worklist *, int)); #define WORKLIST_INSERT(head, item) worklist_insert(head, item) #define WORKLIST_REMOVE(item) worklist_remove(item) #define WORKITEM_FREE(item, type) workitem_free((struct worklist *)item, type) -static void +STATIC void worklist_insert(head, item) struct workhead *head; struct worklist *item; @@ -377,7 +381,7 @@ worklist_insert(head, item) LIST_INSERT_HEAD(head, item, wk_list); } -static void +STATIC void worklist_remove(item) struct worklist *item; { @@ -390,7 +394,7 @@ worklist_remove(item) LIST_REMOVE(item, wk_list); } -static void +STATIC void workitem_free(item, type) struct worklist *item; int type; @@ -407,42 +411,59 @@ workitem_free(item, type) /* * Workitem queue management */ -static struct workhead softdep_workitem_pending; -static int softdep_worklist_busy; -static int max_softdeps; /* maximum number of structs before slowdown */ -static int tickdelay = 2; /* number of ticks to pause during slowdown */ -static int proc_waiting; /* tracks whether we have a timeout posted */ -static struct proc *filesys_syncer; /* proc of filesystem syncer process */ -static int req_clear_inodedeps; /* syncer process flush some inodedeps */ +STATIC struct workhead softdep_workitem_pending; +STATIC int num_on_worklist; /* number of worklist items to be processed */ +STATIC int softdep_worklist_busy; /* 1 => trying to do unmount */ +STATIC int softdep_worklist_req; /* serialized waiters */ +STATIC int max_softdeps; /* maximum number of structs before slowdown */ +STATIC int tickdelay = 2; /* number of ticks to pause during slowdown */ +STATIC int proc_waiting; /* tracks whether we have a timeout posted */ +STATIC int *stat_countp; /* statistic to count in proc_waiting timeout */ +STATIC struct timeout proc_waiting_timeout; +STATIC struct proc *filesys_syncer; /* proc of filesystem syncer process */ +STATIC int req_clear_inodedeps; /* syncer process flush some inodedeps */ #define FLUSH_INODES 1 -static int req_clear_remove; /* syncer process flush some freeblks */ +STATIC int req_clear_remove; /* syncer process flush some freeblks */ #define FLUSH_REMOVE 2 /* * runtime statistics */ -static int stat_blk_limit_push; /* number of times block limit neared */ -static int stat_ino_limit_push; /* number of times inode limit neared */ -static int stat_blk_limit_hit; /* number of times block slowdown imposed */ -static int stat_ino_limit_hit; /* number of times inode slowdown imposed */ -static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ -static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ -static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ -static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ +STATIC int stat_worklist_push; /* number of worklist cleanups */ +STATIC int stat_blk_limit_push; /* 
number of times block limit neared */ +STATIC int stat_ino_limit_push; /* number of times inode limit neared */ +STATIC int stat_blk_limit_hit; /* number of times block slowdown imposed */ +STATIC int stat_ino_limit_hit; /* number of times inode slowdown imposed */ +STATIC int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */ +STATIC int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ +STATIC int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ +STATIC int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ +STATIC int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ #ifdef DEBUG #include <vm/vm.h> #include <sys/sysctl.h> struct ctldebug debug20 = { "max_softdeps", &max_softdeps }; struct ctldebug debug21 = { "tickdelay", &tickdelay }; +struct ctldebug debug22 = { "worklist_push", &stat_worklist_push }; struct ctldebug debug23 = { "blk_limit_push", &stat_blk_limit_push }; struct ctldebug debug24 = { "ino_limit_push", &stat_ino_limit_push }; struct ctldebug debug25 = { "blk_limit_hit", &stat_blk_limit_hit }; struct ctldebug debug26 = { "ino_limit_hit", &stat_ino_limit_hit }; -struct ctldebug debug27 = { "indir_blk_ptrs", &stat_indir_blk_ptrs }; -struct ctldebug debug28 = { "inode_bitmap", &stat_inode_bitmap }; -struct ctldebug debug29 = { "direct_blk_ptrs", &stat_direct_blk_ptrs }; -struct ctldebug debug30 = { "dir_entry", &stat_dir_entry }; +struct ctldebug debug27 = { "sync_limit_hit", &stat_sync_limit_hit }; +struct ctldebug debug28 = { "indir_blk_ptrs", &stat_indir_blk_ptrs }; +struct ctldebug debug29 = { "inode_bitmap", &stat_inode_bitmap }; +struct ctldebug debug30 = { "direct_blk_ptrs", &stat_direct_blk_ptrs }; +struct ctldebug debug31 = { "dir_entry", &stat_dir_entry }; #endif /* DEBUG */ +void wakeup_one __P((void *)); + +void +wakeup_one(c) + void *c; +{ + wakeup(c); +} + /* * Add an item to the end of the work queue. * This routine requires that the lock be held. @@ -450,7 +471,7 @@ struct ctldebug debug30 = { "dir_entry", &stat_dir_entry }; * The following routine is the only one that removes items * and does so in order from first to last. */ -static void +STATIC void add_to_worklist(wk) struct worklist *wk; { @@ -464,6 +485,7 @@ add_to_worklist(wk) else LIST_INSERT_AFTER(worklist_tail, wk, wk_list); worklist_tail = wk; + num_on_worklist += 1; } /* @@ -480,9 +502,8 @@ softdep_process_worklist(matchmnt) struct mount *matchmnt; { struct proc *p = CURPROC; - struct worklist *wk; - struct fs *matchfs; - int matchcnt; + int matchcnt, loopcount; + struct timeval starttime; /* * Record the process identifier of our caller so that we can give @@ -490,133 +511,243 @@ softdep_process_worklist(matchmnt) */ filesys_syncer = p; matchcnt = 0; - matchfs = NULL; - if (matchmnt != NULL) - matchfs = VFSTOUFS(matchmnt)->um_fs; + /* * There is no danger of having multiple processes run this - * code. It is single threaded solely so that softdep_flushfiles - * (below) can get an accurate count of the number of items + * code, but we have to single-thread it when softdep_flushfiles() + * is in operation to get an accurate count of the number of items * related to its mount point that are in the list. */ - if (softdep_worklist_busy && matchmnt == NULL) - return (-1); + if (matchmnt == NULL) { + if (softdep_worklist_busy < 0) + return(-1); + softdep_worklist_busy += 1; + } + /* * If requested, try removing inode or removal dependencies. 
*/ if (req_clear_inodedeps) { clear_inodedeps(p); - req_clear_inodedeps = 0; - wakeup(&proc_waiting); + req_clear_inodedeps -= 1; + wakeup_one(&proc_waiting); } if (req_clear_remove) { clear_remove(p); - req_clear_remove = 0; - wakeup(&proc_waiting); + req_clear_remove -= 1; + wakeup_one(&proc_waiting); } - ACQUIRE_LOCK(&lk); - while ((wk = LIST_FIRST(&softdep_workitem_pending)) != 0) { - WORKLIST_REMOVE(wk); - FREE_LOCK(&lk); - switch (wk->wk_type) { - - case D_DIRREM: - /* removal of a directory entry */ - if (WK_DIRREM(wk)->dm_mnt == matchmnt) - matchcnt += 1; - handle_workitem_remove(WK_DIRREM(wk)); - break; - - case D_FREEBLKS: - /* releasing blocks and/or fragments from a file */ - if (WK_FREEBLKS(wk)->fb_fs == matchfs) - matchcnt += 1; - handle_workitem_freeblocks(WK_FREEBLKS(wk)); - break; - - case D_FREEFRAG: - /* releasing a fragment when replaced as a file grows */ - if (WK_FREEFRAG(wk)->ff_fs == matchfs) - matchcnt += 1; - handle_workitem_freefrag(WK_FREEFRAG(wk)); - break; + loopcount = 1; + starttime = time; + while (num_on_worklist > 0) { + matchcnt += process_worklist_item(matchmnt, 0); - case D_FREEFILE: - /* releasing an inode when its link count drops to 0 */ - if (WK_FREEFILE(wk)->fx_fs == matchfs) - matchcnt += 1; - handle_workitem_freefile(WK_FREEFILE(wk)); + /* + * If a umount operation wants to run the worklist + * accurately, abort. + */ + if (softdep_worklist_req && matchmnt == NULL) { + matchcnt = -1; break; - - default: - panic("%s_process_worklist: Unknown type %s", - "softdep", TYPENAME(wk->wk_type)); - /* NOTREACHED */ } - if (softdep_worklist_busy && matchmnt == NULL) - return (-1); + /* * If requested, try removing inode or removal dependencies. */ if (req_clear_inodedeps) { clear_inodedeps(p); - req_clear_inodedeps = 0; - wakeup(&proc_waiting); + req_clear_inodedeps -= 1; + wakeup_one(&proc_waiting); } if (req_clear_remove) { clear_remove(p); - req_clear_remove = 0; - wakeup(&proc_waiting); + req_clear_remove -= 1; + wakeup_one(&proc_waiting); } - ACQUIRE_LOCK(&lk); + /* + * We do not generally want to stop for buffer space, but if + * we are really being a buffer hog, we will stop and wait. + */ +#if 0 + if (loopcount++ % 128 == 0) + bwillwrite(); +#endif + /* + * Never allow processing to run for more than one + * second. Otherwise the other syncer tasks may get + * excessively backlogged. + */ + { + struct timeval diff; + + timersub(&time, &starttime, &diff); + if (diff.tv_sec > 0 && matchmnt == NULL) { + matchcnt = -1; + break; + } + } + } + if (matchmnt == NULL) { + softdep_worklist_busy -= 1; + if (softdep_worklist_req && softdep_worklist_busy == 0) + wakeup(&softdep_worklist_req); + } + return (matchcnt); +} + +/* + * Process one item on the worklist. + */ +STATIC int +process_worklist_item(matchmnt, flags) + struct mount *matchmnt; + int flags; +{ + struct worklist *wk; + struct dirrem *dirrem; + struct mount *mp; + struct vnode *vp; + int matchcnt = 0; + + ACQUIRE_LOCK(&lk); + /* + * Normally we just process each item on the worklist in order. + * However, if we are in a situation where we cannot lock any + * inodes, we have to skip over any dirrem requests whose + * vnodes are resident and locked. 
+ */ + LIST_FOREACH(wk, &softdep_workitem_pending, wk_list) { + if ((flags & LK_NOWAIT) == 0 || wk->wk_type != D_DIRREM) + break; + dirrem = WK_DIRREM(wk); + vp = ufs_ihashlookup(VFSTOUFS(dirrem->dm_mnt)->um_dev, + dirrem->dm_oldinum); + if (vp == NULL || !VOP_ISLOCKED(vp)) + break; } + if (wk == 0) + return (0); + WORKLIST_REMOVE(wk); + num_on_worklist -= 1; FREE_LOCK(&lk); + switch (wk->wk_type) { + + case D_DIRREM: + /* removal of a directory entry */ + mp = WK_DIRREM(wk)->dm_mnt; +#if 0 + if (vn_write_suspend_wait(NULL, mp, V_NOWAIT)) + panic("%s: dirrem on suspended filesystem", + "process_worklist_item"); +#endif + if (mp == matchmnt) + matchcnt += 1; + handle_workitem_remove(WK_DIRREM(wk)); + break; + + case D_FREEBLKS: + /* releasing blocks and/or fragments from a file */ + mp = WK_FREEBLKS(wk)->fb_mnt; +#if 0 + if (vn_write_suspend_wait(NULL, mp, V_NOWAIT)) + panic("%s: freeblks on suspended filesystem", + "process_worklist_item"); +#endif + if (mp == matchmnt) + matchcnt += 1; + handle_workitem_freeblocks(WK_FREEBLKS(wk)); + break; + + case D_FREEFRAG: + /* releasing a fragment when replaced as a file grows */ + mp = WK_FREEFRAG(wk)->ff_mnt; +#if 0 + if (vn_write_suspend_wait(NULL, mp, V_NOWAIT)) + panic("%s: freefrag on suspended filesystem", + "process_worklist_item"); +#endif + if (mp == matchmnt) + matchcnt += 1; + handle_workitem_freefrag(WK_FREEFRAG(wk)); + break; + + case D_FREEFILE: + /* releasing an inode when its link count drops to 0 */ + mp = WK_FREEFILE(wk)->fx_mnt; +#if 0 + if (vn_write_suspend_wait(NULL, mp, V_NOWAIT)) + panic("%s: freefile on suspended filesystem", + "process_worklist_item"); +#endif + if (mp == matchmnt) + matchcnt += 1; + handle_workitem_freefile(WK_FREEFILE(wk)); + break; + + default: + panic("%s_process_worklist: Unknown type %s", + "softdep", TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } return (matchcnt); } /* + * Move dependencies from one buffer to another. + */ +void +softdep_move_dependencies(oldbp, newbp) + struct buf *oldbp; + struct buf *newbp; +{ + struct worklist *wk, *wktail; + + if (LIST_FIRST(&newbp->b_dep) != NULL) + panic("softdep_move_dependencies: need merge code"); + wktail = 0; + ACQUIRE_LOCK(&lk); + while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { + LIST_REMOVE(wk, wk_list); + if (wktail == 0) + LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); + else + LIST_INSERT_AFTER(wktail, wk, wk_list); + wktail = wk; + } + FREE_LOCK(&lk); +} + +/* * Purge the work list of all items associated with a particular mount point. */ int -softdep_flushfiles(oldmnt, flags, p) +softdep_flushworklist(oldmnt, countp, p) struct mount *oldmnt; - int flags; + int *countp; struct proc *p; { struct vnode *devvp; - int error, loopcnt; + int count, error = 0; /* - * Await our turn to clear out the queue. + * Await our turn to clear out the queue, then serialize access. */ - while (softdep_worklist_busy) - tsleep(&lbolt, PRIBIO, "softflush", 0); - softdep_worklist_busy = 1; - if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) { - softdep_worklist_busy = 0; - return (error); + while (softdep_worklist_busy) { + softdep_worklist_req += 1; + tsleep(&softdep_worklist_req, PRIBIO, "softflush", 0); + softdep_worklist_req -= 1; } + softdep_worklist_busy = -1; /* * Alternately flush the block device associated with the mount * point and process any dependencies that the flushing - * creates. In theory, this loop can happen at most twice, - * but we give it a few extra just to be sure. + * creates. We continue until no more worklist dependencies + * are found. 
*/ + *countp = 0; devvp = VFSTOUFS(oldmnt)->um_devvp; - for (loopcnt = 10; loopcnt > 0; loopcnt--) { - if (softdep_process_worklist(oldmnt) == 0) { - /* - * Do another flush in case any vnodes were brought in - * as part of the cleanup operations. - */ - if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) - break; - /* - * If we still found nothing to do, we are really done. - */ - if (softdep_process_worklist(oldmnt) == 0) - break; - } + while ((count = softdep_process_worklist(oldmnt)) > 0) { + *countp += count; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); error = VOP_FSYNC(devvp, p->p_ucred, MNT_WAIT, p); VOP_UNLOCK(devvp, 0, p); @@ -624,6 +755,39 @@ softdep_flushfiles(oldmnt, flags, p) break; } softdep_worklist_busy = 0; + if (softdep_worklist_req) + wakeup(&softdep_worklist_req); + return (error); +} + +/* + * Flush all vnodes and worklist items associated with a specified mount point. + */ +int +softdep_flushfiles(oldmnt, flags, p) + struct mount *oldmnt; + int flags; + struct proc *p; +{ + int error, count, loopcnt; + + /* + * Alternately flush the vnodes associated with the mount + * point and process any dependencies that the flushing + * creates. In theory, this loop can happen at most twice, + * but we give it a few extra just to be sure. + */ + for (loopcnt = 10; loopcnt > 0; loopcnt--) { + /* + * Do another flush in case any vnodes were brought in + * as part of the cleanup operations. + */ + if ((error = ffs_flushfiles(oldmnt, flags, p)) != 0) + break; + if ((error = softdep_flushworklist(oldmnt, &count, p)) != 0 || + count == 0) + break; + } /* * If we are unmounting then it is an error to fail. If we * are simply trying to downgrade to read-only, then filesystem @@ -660,6 +824,7 @@ softdep_flushfiles(oldmnt, flags, p) * an existing entry is not found. */ #define DEPALLOC 0x0001 /* allocate structure if lookup fails */ +#define NODELAY 0x0002 /* cannot do background work */ /* * Structures and routines associated with pagedep caching. @@ -669,7 +834,7 @@ u_long pagedep_hash; /* size of hash table - 1 */ #define PAGEDEP_HASH(mp, inum, lbn) \ (&pagedep_hashtbl[((((register_t)(mp)) >> 13) + (inum) + (lbn)) & \ pagedep_hash]) -static struct sema pagedep_in_progress; +STATIC struct sema pagedep_in_progress; /* * Look up a pagedep. Return 1 if found, 0 if not found. @@ -677,7 +842,7 @@ static struct sema pagedep_in_progress; * Found or allocated entry is returned in pagedeppp. * This routine must be called with splbio interrupts blocked. */ -static int +STATIC int pagedep_lookup(ip, lbn, flags, pagedeppp) struct inode *ip; ufs_lbn_t lbn; @@ -696,8 +861,7 @@ pagedep_lookup(ip, lbn, flags, pagedeppp) mp = ITOV(ip)->v_mount; pagedephd = PAGEDEP_HASH(mp, ip->i_number, lbn); top: - for (pagedep = LIST_FIRST(pagedephd); pagedep; - pagedep = LIST_NEXT(pagedep, pd_hash)) + LIST_FOREACH(pagedep, pagedephd, pd_hash) if (ip->i_number == pagedep->pd_ino && lbn == pagedep->pd_lbn && mp == pagedep->pd_mnt) @@ -715,7 +879,7 @@ top: goto top; } MALLOC(pagedep, struct pagedep *, sizeof(struct pagedep), M_PAGEDEP, - M_WAITOK); + M_SOFTDEP_FLAGS); bzero(pagedep, sizeof(struct pagedep)); pagedep->pd_list.wk_type = D_PAGEDEP; pagedep->pd_mnt = mp; @@ -736,11 +900,11 @@ top: * Structures and routines associated with inodedep caching. 
*/ LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl; -static u_long inodedep_hash; /* size of hash table - 1 */ -static long num_inodedep; /* number of inodedep allocated */ +STATIC u_long inodedep_hash; /* size of hash table - 1 */ +STATIC long num_inodedep; /* number of inodedep allocated */ #define INODEDEP_HASH(fs, inum) \ (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash]) -static struct sema inodedep_in_progress; +STATIC struct sema inodedep_in_progress; /* * Look up a inodedep. Return 1 if found, 0 if not found. @@ -748,7 +912,7 @@ static struct sema inodedep_in_progress; * Found or allocated entry is returned in inodedeppp. * This routine must be called with splbio interrupts blocked. */ -static int +STATIC int inodedep_lookup(fs, inum, flags, inodedeppp) struct fs *fs; ino_t inum; @@ -766,8 +930,7 @@ inodedep_lookup(fs, inum, flags, inodedeppp) firsttry = 1; inodedephd = INODEDEP_HASH(fs, inum); top: - for (inodedep = LIST_FIRST(inodedephd); inodedep; - inodedep = LIST_NEXT(inodedep, id_hash)) + LIST_FOREACH(inodedep, inodedephd, id_hash) if (inum == inodedep->id_ino && fs == inodedep->id_fs) break; if (inodedep) { @@ -781,7 +944,7 @@ top: /* * If we are over our limit, try to improve the situation. */ - if (num_inodedep > max_softdeps && firsttry && speedup_syncer() == 0 && + if (num_inodedep > max_softdeps && firsttry && (flags & NODELAY) == 0 && request_cleanup(FLUSH_INODES, 1)) { firsttry = 0; goto top; @@ -792,7 +955,7 @@ top: } num_inodedep += 1; MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep), - M_INODEDEP, M_WAITOK); + M_INODEDEP, M_SOFTDEP_FLAGS); inodedep->id_list.wk_type = D_INODEDEP; inodedep->id_fs = fs; inodedep->id_ino = inum; @@ -820,14 +983,14 @@ LIST_HEAD(newblk_hashhead, newblk) *newblk_hashtbl; u_long newblk_hash; /* size of hash table - 1 */ #define NEWBLK_HASH(fs, inum) \ (&newblk_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & newblk_hash]) -static struct sema newblk_in_progress; +STATIC struct sema newblk_in_progress; /* * Look up a newblk. Return 1 if found, 0 if not found. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in newblkpp. 
*/ -static int +STATIC int newblk_lookup(fs, newblkno, flags, newblkpp) struct fs *fs; ufs_daddr_t newblkno; @@ -839,8 +1002,7 @@ newblk_lookup(fs, newblkno, flags, newblkpp) newblkhd = NEWBLK_HASH(fs, newblkno); top: - for (newblk = LIST_FIRST(newblkhd); newblk; - newblk = LIST_NEXT(newblk, nb_hash)) + LIST_FOREACH(newblk, newblkhd, nb_hash) if (newblkno == newblk->nb_newblkno && fs == newblk->nb_fs) break; if (newblk) { @@ -854,7 +1016,7 @@ top: if (sema_get(&newblk_in_progress, 0) == 0) goto top; MALLOC(newblk, struct newblk *, sizeof(struct newblk), - M_NEWBLK, M_WAITOK); + M_NEWBLK, M_SOFTDEP_FLAGS); newblk->nb_state = 0; newblk->nb_fs = fs; newblk->nb_newblkno = newblkno; @@ -874,7 +1036,8 @@ softdep_initialize() LIST_INIT(&mkdirlisthd); LIST_INIT(&softdep_workitem_pending); - max_softdeps = desiredvnodes * (16 / sizeof(register_t)); + max_softdeps = min (desiredvnodes * 8, + kmemstats[M_INODEDEP].ks_limit / (2 * sizeof(struct inodedep))); pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, M_WAITOK, &pagedep_hash); sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0); @@ -901,6 +1064,7 @@ softdep_mount(devvp, mp, fs, cred) struct buf *bp; int error, cyl; + mp->mnt_flag &= ~MNT_ASYNC; mp->mnt_flag |= MNT_SOFTDEP; /* * When doing soft updates, the counters in the @@ -983,7 +1147,8 @@ softdep_setup_inomapdep(bp, ip, newinum) * the cylinder group map from which it was allocated. */ ACQUIRE_LOCK(&lk); - if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC, &inodedep) != 0) + if (inodedep_lookup(ip->i_fs, newinum, DEPALLOC | NODELAY, &inodedep) + != 0) panic("softdep_setup_inomapdep: found inode"); inodedep->id_buf = bp; inodedep->id_state &= ~DEPCOMPLETE; @@ -1024,7 +1189,7 @@ softdep_setup_blkmapdep(bp, fs, newblkno) * this routine is called and this routine must be called with * splbio interrupts blocked. 
*/ -static struct bmsafemap * +STATIC struct bmsafemap * bmsafemap_lookup(bp) struct buf *bp; { @@ -1035,12 +1200,12 @@ bmsafemap_lookup(bp) if (lk.lkt_held == -1) panic("bmsafemap_lookup: lock not held"); #endif - for (wk = LIST_FIRST(&bp->b_dep); wk; wk = LIST_NEXT(wk, wk_list)) + LIST_FOREACH(wk, &bp->b_dep, wk_list) if (wk->wk_type == D_BMSAFEMAP) return (WK_BMSAFEMAP(wk)); FREE_LOCK(&lk); MALLOC(bmsafemap, struct bmsafemap *, sizeof(struct bmsafemap), - M_BMSAFEMAP, M_WAITOK); + M_BMSAFEMAP, M_SOFTDEP_FLAGS); bmsafemap->sm_list.wk_type = D_BMSAFEMAP; bmsafemap->sm_list.wk_state = 0; bmsafemap->sm_buf = bp; @@ -1100,7 +1265,7 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct newblk *newblk; MALLOC(adp, struct allocdirect *, sizeof(struct allocdirect), - M_ALLOCDIRECT, M_WAITOK); + M_ALLOCDIRECT, M_SOFTDEP_FLAGS); bzero(adp, sizeof(struct allocdirect)); adp->ad_list.wk_type = D_ALLOCDIRECT; adp->ad_lbn = lbn; @@ -1118,7 +1283,7 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) panic("softdep_setup_allocdirect: lost block"); ACQUIRE_LOCK(&lk); - (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep); + inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC | NODELAY, &inodedep); adp->ad_inodedep = inodedep; if (newblk->nb_state == DEPCOMPLETE) { @@ -1172,8 +1337,7 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) FREE_LOCK(&lk); return; } - for (oldadp = TAILQ_FIRST(adphead); oldadp; - oldadp = TAILQ_NEXT(oldadp, ad_next)) { + TAILQ_FOREACH(oldadp, adphead, ad_next) { if (oldadp->ad_lbn >= lbn) break; } @@ -1190,7 +1354,7 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) * Replace an old allocdirect dependency with a newer one. * This routine must be called with splbio interrupts blocked. */ -static void +STATIC void allocdirect_merge(adphead, newadp, oldadp) struct allocdirectlst *adphead; /* head of list holding allocdirects */ struct allocdirect *newadp; /* allocdirect being added */ @@ -1237,7 +1401,7 @@ allocdirect_merge(adphead, newadp, oldadp) /* * Allocate a new freefrag structure if needed. */ -static struct freefrag * +STATIC struct freefrag * newfreefrag(ip, blkno, size) struct inode *ip; ufs_daddr_t blkno; @@ -1252,11 +1416,11 @@ newfreefrag(ip, blkno, size) if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) panic("newfreefrag: frag size"); MALLOC(freefrag, struct freefrag *, sizeof(struct freefrag), - M_FREEFRAG, M_WAITOK); + M_FREEFRAG, M_SOFTDEP_FLAGS); freefrag->ff_list.wk_type = D_FREEFRAG; freefrag->ff_state = ip->i_ffs_uid & ~ONWORKLIST; /* XXX - used below */ freefrag->ff_inum = ip->i_number; - freefrag->ff_fs = fs; + freefrag->ff_mnt = ITOV(ip)->v_mount; freefrag->ff_devvp = ip->i_devvp; freefrag->ff_blkno = blkno; freefrag->ff_fragsize = size; @@ -1267,13 +1431,14 @@ newfreefrag(ip, blkno, size) * This workitem de-allocates fragments that were replaced during * file block allocation. */ -static void +STATIC void handle_workitem_freefrag(freefrag) struct freefrag *freefrag; { struct inode tip; - tip.i_fs = freefrag->ff_fs; + tip.i_vnode = NULL; + tip.i_fs = VFSTOUFS(freefrag->ff_mnt)->um_fs; tip.i_devvp = freefrag->ff_devvp; tip.i_dev = freefrag->ff_devvp->v_rdev; tip.i_number = freefrag->ff_inum; @@ -1310,7 +1475,7 @@ handle_workitem_freefrag(freefrag) /* * Allocate a new allocindir structure. 
*/ -static struct allocindir * +STATIC struct allocindir * newallocindir(ip, ptrno, newblkno, oldblkno) struct inode *ip; /* inode for file being extended */ int ptrno; /* offset of pointer in indirect block */ @@ -1320,8 +1485,8 @@ newallocindir(ip, ptrno, newblkno, oldblkno) struct allocindir *aip; MALLOC(aip, struct allocindir *, sizeof(struct allocindir), - M_ALLOCINDIR, M_WAITOK); - bzero(aip, sizeof(struct allocindir)); + M_ALLOCINDIR, M_SOFTDEP_FLAGS); + bzero(aip,sizeof(struct allocindir)); aip->ai_list.wk_type = D_ALLOCINDIR; aip->ai_state = ATTACHED; aip->ai_offset = ptrno; @@ -1388,7 +1553,7 @@ softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) * Called to finish the allocation of the "aip" allocated * by one of the two routines above. */ -static void +STATIC void setup_allocindir_phase2(bp, ip, aip) struct buf *bp; /* in-memory copy of the indirect block */ struct inode *ip; /* inode for file being extended */ @@ -1405,8 +1570,7 @@ setup_allocindir_phase2(bp, ip, aip) panic("setup_allocindir_phase2: not indir blk"); for (indirdep = NULL, newindirdep = NULL; ; ) { ACQUIRE_LOCK(&lk); - for (wk = LIST_FIRST(&bp->b_dep); wk; - wk = LIST_NEXT(wk, wk_list)) { + LIST_FOREACH(wk, &bp->b_dep, wk_list) { if (wk->wk_type != D_INDIRDEP) continue; indirdep = WK_INDIRDEP(wk); @@ -1444,23 +1608,26 @@ setup_allocindir_phase2(bp, ip, aip) if (aip->ai_oldblkno == 0) oldaip = NULL; else - for (oldaip=LIST_FIRST(&indirdep->ir_deplisthd); - oldaip; oldaip = LIST_NEXT(oldaip, ai_next)) + + LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) if (oldaip->ai_offset == aip->ai_offset) break; + freefrag = NULL; if (oldaip != NULL) { if (oldaip->ai_newblkno != aip->ai_oldblkno) panic("setup_allocindir_phase2: blkno"); aip->ai_oldblkno = oldaip->ai_oldblkno; - freefrag = oldaip->ai_freefrag; - oldaip->ai_freefrag = aip->ai_freefrag; - aip->ai_freefrag = freefrag; + freefrag = aip->ai_freefrag; + aip->ai_freefrag = oldaip->ai_freefrag; + oldaip->ai_freefrag = NULL; free_allocindir(oldaip, NULL); } LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); ((ufs_daddr_t *)indirdep->ir_savebp->b_data) [aip->ai_offset] = aip->ai_oldblkno; FREE_LOCK(&lk); + if (freefrag != NULL) + handle_workitem_freefrag(freefrag); } if (newindirdep) { if (indirdep->ir_savebp != NULL) @@ -1470,16 +1637,21 @@ setup_allocindir_phase2(bp, ip, aip) if (indirdep) break; MALLOC(newindirdep, struct indirdep *, sizeof(struct indirdep), - M_INDIRDEP, M_WAITOK); + M_INDIRDEP, M_SOFTDEP_FLAGS); newindirdep->ir_list.wk_type = D_INDIRDEP; newindirdep->ir_state = ATTACHED; LIST_INIT(&newindirdep->ir_deplisthd); LIST_INIT(&newindirdep->ir_donehd); - newindirdep->ir_saveddata = (ufs_daddr_t *)bp->b_data; + if (bp->b_blkno == bp->b_lblkno) { + VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, + NULL); + } newindirdep->ir_savebp = getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0); - bcopy((caddr_t)newindirdep->ir_saveddata, - newindirdep->ir_savebp->b_data, bp->b_bcount); +#if 0 + BUF_KERNPROC(newindirdep->ir_savebp); +#endif + bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); } } @@ -1512,7 +1684,6 @@ setup_allocindir_phase2(bp, ip, aip) * later release and zero the inode so that the calling routine * can release it. 
*/ -static long num_freeblks; /* number of freeblks allocated */ void softdep_setup_freeblocks(ip, length) struct inode *ip; /* The inode whose length is to be reduced */ @@ -1524,25 +1695,19 @@ softdep_setup_freeblocks(ip, length) struct vnode *vp; struct buf *bp; struct fs *fs; - int i, error; + int i, delay, error; fs = ip->i_fs; if (length != 0) - panic("softde_setup_freeblocks: non-zero length"); - /* - * If we are over our limit, try to improve the situation. - */ - if (num_freeblks > max_softdeps / 2 && speedup_syncer() == 0) - (void) request_cleanup(FLUSH_REMOVE, 0); - num_freeblks += 1; + panic("softdep_setup_freeblocks: non-zero length"); MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks), - M_FREEBLKS, M_WAITOK); + M_FREEBLKS, M_SOFTDEP_FLAGS); bzero(freeblks, sizeof(struct freeblks)); freeblks->fb_list.wk_type = D_FREEBLKS; freeblks->fb_uid = ip->i_ffs_uid; freeblks->fb_previousinum = ip->i_number; freeblks->fb_devvp = ip->i_devvp; - freeblks->fb_fs = fs; + freeblks->fb_mnt = ITOV(ip)->v_mount; freeblks->fb_oldsize = ip->i_ffs_size; freeblks->fb_newsize = length; freeblks->fb_chkcnt = ip->i_ffs_blocks; @@ -1576,19 +1741,26 @@ softdep_setup_freeblocks(ip, length) panic("softdep_setup_freeblocks: inode busy"); /* * Add the freeblks structure to the list of operations that - * must await the zero'ed inode being written to disk. + * must await the zero'ed inode being written to disk. If we + * still have a bitmap dependency (delay == 0), then the inode + * has never been written to disk, so we can process the + * freeblks below once we have deleted the dependencies. */ - WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list); + delay = (inodedep->id_state & DEPCOMPLETE); + if (delay) + WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list); /* * Because the file length has been truncated to zero, any * pending block allocation dependency structures associated * with this inode are obsolete and can simply be de-allocated. * We must first merge the two dependency lists to get rid of * any duplicate freefrag structures, then purge the merged list. + * If we still have a bitmap dependency, then the inode has never + * been written to disk, so we can free any fragments without delay. */ merge_inode_lists(inodedep); while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) - free_allocdirect(&inodedep->id_inoupdt, adp, 1); + free_allocdirect(&inodedep->id_inoupdt, adp, delay); FREE_LOCK(&lk); bdwrite(bp); /* @@ -1604,17 +1776,21 @@ softdep_setup_freeblocks(ip, length) bp = LIST_FIRST(&vp->v_dirtyblkhd); (void) inodedep_lookup(fs, ip->i_number, 0, &inodedep); deallocate_dependencies(bp, inodedep); - bp->b_flags |= B_INVAL; + bp->b_flags |= B_INVAL | B_NOCACHE; FREE_LOCK(&lk); brelse(bp); ACQUIRE_LOCK(&lk); } - /* - * Try freeing the inodedep in case that was the last dependency. - */ - if ((inodedep_lookup(fs, ip->i_number, 0, &inodedep)) != 0) + if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0) (void) free_inodedep(inodedep); FREE_LOCK(&lk); + /* + * If the inode has never been written to disk (delay == 0), + * then we can process the freeblks now that we have deleted + * the dependencies. + */ + if (!delay) + handle_workitem_freeblocks(freeblks); } /* @@ -1624,7 +1800,7 @@ softdep_setup_freeblocks(ip, length) * its associated dependencies. The mutex is held so that other I/O's * associated with related dependencies do not occur. 
*/ -static void +STATIC void deallocate_dependencies(bp, inodedep) struct buf *bp; struct inodedep *inodedep; @@ -1688,11 +1864,12 @@ deallocate_dependencies(bp, inodedep) * If the inode has already been written, then they * can be dumped directly onto the work list. */ - for (dirrem = LIST_FIRST(&pagedep->pd_dirremhd); dirrem; - dirrem = LIST_NEXT(dirrem, dm_next)) { + LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { LIST_REMOVE(dirrem, dm_next); dirrem->dm_dirinum = pagedep->pd_ino; - if (inodedep == NULL) + if (inodedep == NULL || + (inodedep->id_state & ALLCOMPLETE) == + ALLCOMPLETE) add_to_worklist(&dirrem->dm_list); else WORKLIST_INSERT(&inodedep->id_bufwait, @@ -1725,7 +1902,7 @@ deallocate_dependencies(bp, inodedep) * Free an allocdirect. Generate a new freefrag work request if appropriate. * This routine must be called with splbio interrupts blocked. */ -static void +STATIC void free_allocdirect(adphead, adp, delay) struct allocdirectlst *adphead; struct allocdirect *adp; @@ -1755,79 +1932,91 @@ free_allocdirect(adphead, adp, delay) * Prepare an inode to be freed. The actual free operation is not * done until the zero'ed inode has been written to disk. */ -static long num_freefile; /* number of freefile allocated */ void -softdep_freefile(ap) - struct vop_vfree_args /* { - struct vnode *a_pvp; - ino_t a_ino; - int a_mode; - } */ *ap; +softdep_freefile(pvp, ino, mode) + struct vnode *pvp; + ino_t ino; + int mode; { - struct inode *ip = VTOI(ap->a_pvp); + struct inode *ip = VTOI(pvp); struct inodedep *inodedep; struct freefile *freefile; /* - * If we are over our limit, try to improve the situation. - */ - if (num_freefile > max_softdeps / 2 && speedup_syncer() == 0) - (void) request_cleanup(FLUSH_REMOVE, 0); - /* * This sets up the inode de-allocation dependency. */ - num_freefile += 1; MALLOC(freefile, struct freefile *, sizeof(struct freefile), - M_FREEFILE, M_WAITOK); + M_FREEFILE, M_SOFTDEP_FLAGS); freefile->fx_list.wk_type = D_FREEFILE; freefile->fx_list.wk_state = 0; - freefile->fx_mode = ap->a_mode; - freefile->fx_oldinum = ap->a_ino; + freefile->fx_mode = mode; + freefile->fx_oldinum = ino; freefile->fx_devvp = ip->i_devvp; - freefile->fx_fs = ip->i_fs; + freefile->fx_mnt = ITOV(ip)->v_mount; /* * If the inodedep does not exist, then the zero'ed inode has - * been written to disk and we can free the file immediately. + * been written to disk. If the allocated inode has never been + * written to disk, then the on-disk inode is zero'ed. In either + * case we can free the file immediately. */ ACQUIRE_LOCK(&lk); - if (inodedep_lookup(ip->i_fs, ap->a_ino, 0, &inodedep) == 0) { - add_to_worklist(&freefile->fx_list); + if (inodedep_lookup(ip->i_fs, ino, 0, &inodedep) == 0 || + check_inode_unwritten(inodedep)) { FREE_LOCK(&lk); + handle_workitem_freefile(freefile); return; } + WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); + FREE_LOCK(&lk); +} - /* - * If we still have a bitmap dependency, then the inode has never - * been written to disk. Drop the dependency as it is no longer - * necessary since the inode is being deallocated. We could process - * the freefile immediately, but then we would have to clear the - * id_inowait dependencies here and it is easier just to let the - * zero'ed inode be written and let them be cleaned up in the - * normal followup actions that follow the inode write. 
- */ - if ((inodedep->id_state & DEPCOMPLETE) == 0) { - inodedep->id_state |= DEPCOMPLETE; - LIST_REMOVE(inodedep, id_deps); - inodedep->id_buf = NULL; +/* + * Check to see if an inode has never been written to disk. If + * so free the inodedep and return success, otherwise return failure. + * This routine must be called with splbio interrupts blocked. + * + * If we still have a bitmap dependency, then the inode has never + * been written to disk. Drop the dependency as it is no longer + * necessary since the inode is being deallocated. We set the + * ALLCOMPLETE flags since the bitmap now properly shows that the + * inode is not allocated. Even if the inode is actively being + * written, it has been rolled back to its zero'ed state, so we + * are ensured that a zero inode is what is on the disk. For short + * lived files, this change will usually result in removing all the + * dependencies from the inode so that it can be freed immediately. + */ +STATIC int +check_inode_unwritten(inodedep) + struct inodedep *inodedep; +{ + + if ((inodedep->id_state & DEPCOMPLETE) != 0 || + LIST_FIRST(&inodedep->id_pendinghd) != NULL || + LIST_FIRST(&inodedep->id_bufwait) != NULL || + LIST_FIRST(&inodedep->id_inowait) != NULL || + TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || + TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL || + inodedep->id_nlinkdelta != 0) + return (0); + inodedep->id_state |= ALLCOMPLETE; + LIST_REMOVE(inodedep, id_deps); + inodedep->id_buf = NULL; + if (inodedep->id_state & ONWORKLIST) + WORKLIST_REMOVE(&inodedep->id_list); + if (inodedep->id_savedino != NULL) { + FREE(inodedep->id_savedino, M_INODEDEP); + inodedep->id_savedino = NULL; } - /* - * If the inodedep has no dependencies associated with it, - * then we must free it here and free the file immediately. - * This case arises when an early allocation fails (for - * example, the user is over their file quota). - */ if (free_inodedep(inodedep) == 0) - WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); - else - add_to_worklist(&freefile->fx_list); - FREE_LOCK(&lk); + panic("check_inode_unwritten: busy inode"); + return (1); } /* * Try to free an inodedep structure. Return 1 if it could be freed. */ -static int +STATIC int free_inodedep(inodedep) struct inodedep *inodedep; { @@ -1855,7 +2044,7 @@ free_inodedep(inodedep) * to the number of blocks allocated for the file) are also * performed in this function. */ -static void +STATIC void handle_workitem_freeblocks(freeblks) struct freeblks *freeblks; { @@ -1867,13 +2056,13 @@ handle_workitem_freeblocks(freeblks) int error, allerror = 0; ufs_lbn_t baselbns[NIADDR], tmpval; + tip.i_fs = fs = VFSTOUFS(freeblks->fb_mnt)->um_fs; tip.i_number = freeblks->fb_previousinum; tip.i_devvp = freeblks->fb_devvp; tip.i_dev = freeblks->fb_devvp->v_rdev; - tip.i_fs = freeblks->fb_fs; tip.i_ffs_size = freeblks->fb_oldsize; tip.i_ffs_uid = freeblks->fb_uid; - fs = freeblks->fb_fs; + tip.i_vnode = NULL; tmpval = 1; baselbns[0] = NDADDR; for (i = 1; i < NIADDR; i++) { @@ -1907,12 +2096,11 @@ handle_workitem_freeblocks(freeblks) #ifdef DIAGNOSTIC if (freeblks->fb_chkcnt != blocksreleased) - panic("handle_workitem_freeblocks: block count"); + printf("handle_workitem_freeblocks: block count"); if (allerror) softdep_error("handle_workitem_freeblks", allerror); #endif /* DIAGNOSTIC */ WORKITEM_FREE(freeblks, D_FREEBLKS); - num_freeblks -= 1; } /* @@ -1921,7 +2109,7 @@ handle_workitem_freeblocks(freeblks) * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. 
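check_inode_unwritten() above treats an inode that still carries its bitmap dependency (DEPCOMPLETE clear) and has no other pending work as one that never reached the disk, so the whole inodedep can be torn down on the spot. A small model of that predicate, with the kernel's dependency lists collapsed into counters for illustration:

#include <stdio.h>

#define DEPCOMPLETE	0x0001	/* inode's bitmap block has been written */

/* Stand-in for struct inodedep: the lists reduced to simple counts. */
struct inodedep_model {
	int id_state;
	int pending_work;	/* id_pendinghd, id_bufwait, id_inowait, ... */
	int nlinkdelta;		/* link count changes not yet on disk */
};

/*
 * Return 1 if the inode has never been written to disk and carries
 * no other dependencies, so it can be discarded immediately.
 */
static int
inode_unwritten_model(const struct inodedep_model *idp)
{
	if ((idp->id_state & DEPCOMPLETE) != 0 ||
	    idp->pending_work != 0 || idp->nlinkdelta != 0)
		return (0);
	return (1);
}

int
main(void)
{
	struct inodedep_model shortlived = { 0, 0, 0 };
	struct inodedep_model committed = { DEPCOMPLETE, 0, 0 };

	printf("short-lived file: %s\n",
	    inode_unwritten_model(&shortlived) ? "free now" : "must wait");
	printf("inode already on disk: %s\n",
	    inode_unwritten_model(&committed) ? "free now" : "must wait");
	return (0);
}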
*/ -static int +STATIC int indir_trunc(ip, dbn, level, lbn, countp) struct inode *ip; ufs_daddr_t dbn; @@ -1988,7 +2176,7 @@ indir_trunc(ip, dbn, level, lbn, countp) ffs_blkfree(ip, nb, fs->fs_bsize); *countp += nblocks; } - bp->b_flags |= B_INVAL; + bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); return (allerror); } @@ -1997,7 +2185,7 @@ indir_trunc(ip, dbn, level, lbn, countp) * Free an allocindir. * This routine must be called with splbio interrupts blocked. */ -static void +STATIC void free_allocindir(aip, inodedep) struct allocindir *aip; struct inodedep *inodedep; @@ -2074,8 +2262,9 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp) fs = dp->i_fs; lbn = lblkno(fs, diroffset); offset = blkoff(fs, diroffset); - MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, M_WAITOK); - bzero(dap, sizeof(struct diradd)); + MALLOC(dap, struct diradd *, sizeof(struct diradd), M_DIRADD, + M_SOFTDEP_FLAGS); + bzero(dap,sizeof(struct diradd)); dap->da_list.wk_type = D_DIRADD; dap->da_offset = offset; dap->da_newinum = newinum; @@ -2086,12 +2275,12 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp) } else { dap->da_state |= MKDIR_BODY | MKDIR_PARENT; MALLOC(mkdir1, struct mkdir *, sizeof(struct mkdir), M_MKDIR, - M_WAITOK); + M_SOFTDEP_FLAGS); mkdir1->md_list.wk_type = D_MKDIR; mkdir1->md_state = MKDIR_BODY; mkdir1->md_diradd = dap; MALLOC(mkdir2, struct mkdir *, sizeof(struct mkdir), M_MKDIR, - M_WAITOK); + M_SOFTDEP_FLAGS); mkdir2->md_list.wk_type = D_MKDIR; mkdir2->md_state = MKDIR_PARENT; mkdir2->md_diradd = dap; @@ -2165,8 +2354,8 @@ softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize) goto done; oldoffset = offset + (oldloc - base); newoffset = offset + (newloc - base); - for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(oldoffset)]); - dap; dap = LIST_NEXT(dap, da_pdlist)) { + + LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(oldoffset)], da_pdlist) { if (dap->da_offset != oldoffset) continue; dap->da_offset = newoffset; @@ -2178,8 +2367,8 @@ softdep_change_directoryentry_offset(dp, base, oldloc, newloc, entrysize) break; } if (dap == NULL) { - for (dap = LIST_FIRST(&pagedep->pd_pendinghd); - dap; dap = LIST_NEXT(dap, da_pdlist)) { + + LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) { if (dap->da_offset == oldoffset) { dap->da_offset = newoffset; break; @@ -2195,7 +2384,7 @@ done: * Free a diradd dependency structure. This routine must be called * with splbio interrupts blocked. */ -static void +STATIC void free_diradd(dap) struct diradd *dap; { @@ -2261,32 +2450,50 @@ softdep_setup_remove(bp, dp, ip, isrmdir) struct inode *ip; /* inode for directory entry being removed */ int isrmdir; /* indicates if doing RMDIR */ { - struct dirrem *dirrem; + struct dirrem *dirrem, *prevdirrem; /* * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. */ - dirrem = newdirrem(bp, dp, ip, isrmdir); + dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); + + /* + * If the COMPLETE flag is clear, then there were no active + * entries and we want to roll back to a zeroed entry until + * the new inode is committed to disk. If the COMPLETE flag is + * set then we have deleted an entry that never made it to + * disk. If the entry we deleted resulted from a name change, + * then the old name still resides on disk. We cannot delete + * its inode (returned to us in prevdirrem) until the zeroed + * directory entry gets to disk. The new inode has never been + * referenced on the disk, so can be deleted immediately. 
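The comment above separates two removal cases: an entry already on disk is rolled back to a zero'ed entry and its dirrem queued on the pagedep, while an entry that never made it to disk can have its inode removed at once, queueing only the previous inode left over from a name change. A condensed model of that branch, using placeholder types rather than the kernel's dirrem:

#include <stdio.h>
#include <stddef.h>

#define COMPLETE	0x0004	/* entry was never written to disk */

struct dirrem_model {
	int dm_state;
	struct dirrem_model *dm_prev;	/* stand-in for prevdirrem */
};

static void
queue_on_pagedep(const char *what)
{
	printf("queue %s until the zero'ed entry reaches disk\n", what);
}

static void
remove_inode_now(void)
{
	printf("handle_workitem_remove() immediately\n");
}

static void
setup_remove_model(struct dirrem_model *dirrem)
{
	if ((dirrem->dm_state & COMPLETE) == 0) {
		/* Entry exists on disk: wait for the cleared entry. */
		queue_on_pagedep("dirrem");
		return;
	}
	/* Entry never reached disk; an old name (if any) still must wait. */
	if (dirrem->dm_prev != NULL)
		queue_on_pagedep("previous inode's dirrem");
	remove_inode_now();
}

int
main(void)
{
	struct dirrem_model ondisk = { 0, NULL };
	struct dirrem_model renamed = { COMPLETE, &ondisk };

	setup_remove_model(&ondisk);
	setup_remove_model(&renamed);
	return (0);
}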
+ */ if ((dirrem->dm_state & COMPLETE) == 0) { LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, dm_next); + FREE_LOCK(&lk); } else { + if (prevdirrem != NULL) + LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, + prevdirrem, dm_next); dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; - add_to_worklist(&dirrem->dm_list); + FREE_LOCK(&lk); + handle_workitem_remove(dirrem); } - FREE_LOCK(&lk); } /* * Allocate a new dirrem if appropriate and return it along with * its associated pagedep. Called without a lock, returns with lock. */ -static struct dirrem * -newdirrem(bp, dp, ip, isrmdir) +STATIC long num_dirrem; /* number of dirrem allocated */ +STATIC struct dirrem * +newdirrem(bp, dp, ip, isrmdir, prevdirremp) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ int isrmdir; /* indicates if doing RMDIR */ + struct dirrem **prevdirremp; /* previously referenced inode, if any */ { int offset; ufs_lbn_t lbn; @@ -2299,13 +2506,22 @@ newdirrem(bp, dp, ip, isrmdir) */ if (ip == NULL) panic("newdirrem: whiteout"); + /* + * If we are over our limit, try to improve the situation. + * Limiting the number of dirrem structures will also limit + * the number of freefile and freeblks structures. + */ + if (num_dirrem > max_softdeps / 2) + (void) request_cleanup(FLUSH_REMOVE, 0); + num_dirrem += 1; MALLOC(dirrem, struct dirrem *, sizeof(struct dirrem), - M_DIRREM, M_WAITOK); - bzero(dirrem, sizeof(struct dirrem)); + M_DIRREM, M_SOFTDEP_FLAGS); + bzero(dirrem,sizeof(struct dirrem)); dirrem->dm_list.wk_type = D_DIRREM; dirrem->dm_state = isrmdir ? RMDIR : 0; dirrem->dm_mnt = ITOV(ip)->v_mount; dirrem->dm_oldinum = ip->i_number; + *prevdirremp = NULL; ACQUIRE_LOCK(&lk); lbn = lblkno(dp->i_fs, dp->i_offset); @@ -2319,28 +2535,42 @@ newdirrem(bp, dp, ip, isrmdir) * be de-allocated. Check for an entry on both the pd_dirraddhd * list and the pd_pendinghd list. */ - for (dap = LIST_FIRST(&pagedep->pd_diraddhd[DIRADDHASH(offset)]); - dap; dap = LIST_NEXT(dap, da_pdlist)) + + LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) if (dap->da_offset == offset) break; if (dap == NULL) { - for (dap = LIST_FIRST(&pagedep->pd_pendinghd); - dap; dap = LIST_NEXT(dap, da_pdlist)) + + LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) if (dap->da_offset == offset) break; if (dap == NULL) return (dirrem); } /* - * Must be ATTACHED at this point, so just delete it. + * Must be ATTACHED at this point. */ if ((dap->da_state & ATTACHED) == 0) panic("newdirrem: not ATTACHED"); if (dap->da_newinum != ip->i_number) panic("newdirrem: inum %d should be %d", ip->i_number, dap->da_newinum); - free_diradd(dap); + /* + * If we are deleting a changed name that never made it to disk, + * then return the dirrem describing the previous inode (which + * represents the inode currently referenced from this entry on disk). + */ + if ((dap->da_state & DIRCHG) != 0) { + *prevdirremp = dap->da_previous; + dap->da_state &= ~DIRCHG; + dap->da_pagedep = pagedep; + } + /* + * We are deleting an entry that never made it to disk. + * Mark it COMPLETE so we can delete its inode immediately. 
+ */ dirrem->dm_state |= COMPLETE; + free_diradd(dap); return (dirrem); } @@ -2371,7 +2601,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) { int offset; struct diradd *dap = NULL; - struct dirrem *dirrem; + struct dirrem *dirrem, *prevdirrem; struct pagedep *pagedep; struct inodedep *inodedep; @@ -2382,8 +2612,8 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) */ if (newinum != WINO) { MALLOC(dap, struct diradd *, sizeof(struct diradd), - M_DIRADD, M_WAITOK); - bzero(dap, sizeof(struct diradd)); + M_DIRADD, M_SOFTDEP_FLAGS); + bzero(dap,sizeof(struct diradd)); dap->da_list.wk_type = D_DIRADD; dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; dap->da_offset = offset; @@ -2393,7 +2623,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) /* * Allocate a new dirrem and ACQUIRE_LOCK. */ - dirrem = newdirrem(bp, dp, ip, isrmdir); + dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); pagedep = dirrem->dm_pagedep; /* * The possible values for isrmdir: @@ -2427,11 +2657,35 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) } /* + * If the COMPLETE flag is clear, then there were no active + * entries and we want to roll back to the previous inode until + * the new inode is committed to disk. If the COMPLETE flag is + * set, then we have deleted an entry that never made it to disk. + * If the entry we deleted resulted from a name change, then the old + * inode reference still resides on disk. Any rollback that we do + * needs to be to that old inode (returned to us in prevdirrem). If + * the entry we deleted resulted from a create, then there is + * no entry on the disk, so we want to roll back to zero rather + * than the uncommitted inode. In either of the COMPLETE cases we + * want to immediately free the unwritten and unreferenced inode. + */ + if ((dirrem->dm_state & COMPLETE) == 0) { + dap->da_previous = dirrem; + } else { + if (prevdirrem != NULL) { + dap->da_previous = prevdirrem; + } else { + dap->da_state &= ~DIRCHG; + dap->da_pagedep = pagedep; + } + dirrem->dm_dirinum = pagedep->pd_ino; + add_to_worklist(&dirrem->dm_list); + } + /* * Link into its inodedep. Put it on the id_bufwait list if the inode * is not yet written. If it is written, do the post-inode write * processing to put it on the id_pendinghd list. */ - dap->da_previous = dirrem; if (inodedep_lookup(dp->i_fs, newinum, DEPALLOC, &inodedep) == 0 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state |= COMPLETE; @@ -2442,35 +2696,26 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); } - /* - * If the previous inode was never written or its previous directory - * entry was never written, then we do not want to roll back to this - * previous value. Instead we want to roll back to zero and immediately - * free the unwritten or unreferenced inode. - */ - if (dirrem->dm_state & COMPLETE) { - dap->da_state &= ~DIRCHG; - dap->da_pagedep = pagedep; - dirrem->dm_dirinum = pagedep->pd_ino; - add_to_worklist(&dirrem->dm_list); - } FREE_LOCK(&lk); } /* - * Called whenever the link count on an inode is increased. + * Called whenever the link count on an inode is changed. * It creates an inode dependency so that the new reference(s) * to the inode cannot be committed to disk until the updated * inode has been written. 
*/ void -softdep_increase_linkcnt(ip) +softdep_change_linkcnt(ip) struct inode *ip; /* the inode with the increased link count */ { struct inodedep *inodedep; ACQUIRE_LOCK(&lk); (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, &inodedep); + if (ip->i_ffs_nlink < ip->i_effnlink) + panic("softdep_change_linkcnt: bad delta"); + inodedep->id_nlinkdelta = ip->i_ffs_nlink - ip->i_effnlink; FREE_LOCK(&lk); } @@ -2478,7 +2723,7 @@ softdep_increase_linkcnt(ip) * This workitem decrements the inode's link count. * If the link count reaches zero, the file is removed. */ -static void +STATIC void handle_workitem_remove(dirrem) struct dirrem *dirrem; { @@ -2486,6 +2731,7 @@ handle_workitem_remove(dirrem) struct inodedep *inodedep; struct vnode *vp; struct inode *ip; + ino_t oldinum; int error; if ((error = VFS_VGET(dirrem->dm_mnt, dirrem->dm_oldinum, &vp)) != 0) { @@ -2493,15 +2739,21 @@ handle_workitem_remove(dirrem) return; } ip = VTOI(vp); + ACQUIRE_LOCK(&lk); + if ((inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, 0, &inodedep)) == 0) + panic("handle_workitem_remove: lost inodedep"); /* * Normal file deletion. */ if ((dirrem->dm_state & RMDIR) == 0) { ip->i_ffs_nlink--; + ip->i_flag |= IN_CHANGE; if (ip->i_ffs_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad file delta"); - ip->i_flag |= IN_CHANGE; + inodedep->id_nlinkdelta = ip->i_ffs_nlink - ip->i_effnlink; + FREE_LOCK(&lk); vput(vp); + num_dirrem -= 1; WORKITEM_FREE(dirrem, D_DIRREM); return; } @@ -2513,9 +2765,11 @@ handle_workitem_remove(dirrem) * the parent decremented to account for the loss of "..". */ ip->i_ffs_nlink -= 2; + ip->i_flag |= IN_CHANGE; if (ip->i_ffs_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad dir delta"); - ip->i_flag |= IN_CHANGE; + inodedep->id_nlinkdelta = ip->i_ffs_nlink - ip->i_effnlink; + FREE_LOCK(&lk); if ((error = VOP_TRUNCATE(vp, (off_t)0, 0, p->p_ucred, p)) != 0) softdep_error("handle_workitem_remove: truncate", error); /* @@ -2525,14 +2779,27 @@ handle_workitem_remove(dirrem) */ if (dirrem->dm_state & DIRCHG) { vput(vp); + num_dirrem -= 1; WORKITEM_FREE(dirrem, D_DIRREM); return; } + /* + * If the inodedep does not exist, then the zero'ed inode has + * been written to disk. If the allocated inode has never been + * written to disk, then the on-disk inode is zero'ed. In either + * case we can remove the file immediately. + */ ACQUIRE_LOCK(&lk); - (void) inodedep_lookup(ip->i_fs, dirrem->dm_oldinum, DEPALLOC, - &inodedep); dirrem->dm_state = 0; + oldinum = dirrem->dm_oldinum; dirrem->dm_oldinum = dirrem->dm_dirinum; + if (inodedep_lookup(ip->i_fs, oldinum, 0, &inodedep) == 0 || + check_inode_unwritten(inodedep)) { + FREE_LOCK(&lk); + vput(vp); + handle_workitem_remove(dirrem); + return; + } WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); FREE_LOCK(&lk); vput(vp); @@ -2552,33 +2819,39 @@ handle_workitem_remove(dirrem) * procedure above (softdep_setup_freeblocks) and completed by the * following procedure. 
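softdep_change_linkcnt() above records the gap between the on-disk link count (i_ffs_nlink) and the effective link count (i_effnlink) in id_nlinkdelta, and handle_workitem_remove() keeps re-deriving the same delta as it applies the deferred decrements. A toy illustration of that bookkeeping with plain integers standing in for the inode and inodedep fields:

#include <assert.h>
#include <stdio.h>

/* Stand-ins for the two link counts soft updates keeps in step. */
struct inode_model {
	int ffs_nlink;	/* link count as it will appear on disk */
	int effnlink;	/* link count including uncommitted operations */
};

struct inodedep_model {
	int nlinkdelta;	/* ffs_nlink - effnlink, still owed to the disk */
};

static void
change_linkcnt_model(struct inode_model *ip, struct inodedep_model *idp)
{
	/* The on-disk count may lag, but never drops below the effective one. */
	assert(ip->ffs_nlink >= ip->effnlink);
	idp->nlinkdelta = ip->ffs_nlink - ip->effnlink;
}

int
main(void)
{
	struct inode_model ip = { 2, 2 };
	struct inodedep_model idp = { 0 };

	/* An unlink takes effect immediately in the effective count... */
	ip.effnlink--;
	change_linkcnt_model(&ip, &idp);
	printf("delta owed to disk: %d\n", idp.nlinkdelta);

	/* ...and reaches the on-disk count once the removal is safe. */
	ip.ffs_nlink--;
	change_linkcnt_model(&ip, &idp);
	printf("delta owed to disk: %d\n", idp.nlinkdelta);
	return (0);
}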
*/ -static void +STATIC void handle_workitem_freefile(freefile) struct freefile *freefile; { + struct fs *fs; struct vnode vp; struct inode tip; struct inodedep *idp; - struct vop_vfree_args args; int error; + fs = VFSTOUFS(freefile->fx_mnt)->um_fs; #ifdef DEBUG ACQUIRE_LOCK(&lk); - if (inodedep_lookup(freefile->fx_fs, freefile->fx_oldinum, 0, &idp)) + if (inodedep_lookup(fs, freefile->fx_oldinum, 0, &idp)) panic("handle_workitem_freefile: inodedep survived"); FREE_LOCK(&lk); #endif tip.i_devvp = freefile->fx_devvp; tip.i_dev = freefile->fx_devvp->v_rdev; - tip.i_fs = freefile->fx_fs; + tip.i_fs = fs; + tip.i_vnode = &vp; vp.v_data = &tip; - args.a_pvp = &vp; - args.a_ino = freefile->fx_oldinum; - args.a_mode = freefile->fx_mode; - if ((error = ffs_freefile(&args)) != 0) - softdep_error("handle_workitem_freefile", error); + { + struct vop_vfree_args vargs; + + vargs.a_pvp = &vp; + vargs.a_ino = freefile->fx_oldinum; + vargs.a_mode = freefile->fx_mode; + + if ((error = ffs_freefile(&vargs)) != 0) + softdep_error("handle_workitem_freefile", error); + } WORKITEM_FREE(freefile, D_FREEFILE); - num_freefile -= 1; } /* @@ -2641,7 +2914,7 @@ softdep_disk_io_initiation(bp) * dependency can be freed. */ if (LIST_FIRST(&indirdep->ir_deplisthd) == NULL) { - indirdep->ir_savebp->b_flags |= B_INVAL; + indirdep->ir_savebp->b_flags |= B_INVAL | B_NOCACHE; brelse(indirdep->ir_savebp); /* inline expand WORKLIST_REMOVE(wk); */ wk->wk_state &= ~ONWORKLIST; @@ -2652,10 +2925,14 @@ softdep_disk_io_initiation(bp) /* * Replace up-to-date version with safe version. */ + MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount, + M_INDIRDEP, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(&lk); indirdep->ir_state &= ~ATTACHED; indirdep->ir_state |= UNDONE; - bp->b_data = indirdep->ir_savebp->b_data; + bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); + bcopy(indirdep->ir_savebp->b_data, bp->b_data, + bp->b_bcount); FREE_LOCK(&lk); continue; @@ -2679,7 +2956,7 @@ softdep_disk_io_initiation(bp) * thus, no I/O completion operations can occur while we are * manipulating its associated dependencies. */ -static void +STATIC void initiate_write_filepage(pagedep, bp) struct pagedep *pagedep; struct buf *bp; @@ -2700,8 +2977,7 @@ initiate_write_filepage(pagedep, bp) pagedep->pd_state |= IOSTARTED; ACQUIRE_LOCK(&lk); for (i = 0; i < DAHASHSZ; i++) { - for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; - dap = LIST_NEXT(dap, da_pdlist)) { + LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { ep = (struct direct *) ((char *)bp->b_data + dap->da_offset); if (ep->d_ino != dap->da_newinum) @@ -2725,7 +3001,7 @@ initiate_write_filepage(pagedep, bp) * locked, thus, no I/O completion operations can occur while we * are manipulating its associated dependencies. 
*/ -static void +STATIC void initiate_write_inodeblock(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* The inode block */ @@ -2750,7 +3026,7 @@ initiate_write_inodeblock(inodedep, bp) if (inodedep->id_savedino != NULL) panic("initiate_write_inodeblock: already doing I/O"); MALLOC(inodedep->id_savedino, struct dinode *, - sizeof(struct dinode), M_INODEDEP, M_WAITOK); + sizeof(struct dinode), M_INODEDEP, M_SOFTDEP_FLAGS); *inodedep->id_savedino = *dp; bzero((caddr_t)dp, sizeof(struct dinode)); return; @@ -2942,7 +3218,9 @@ softdep_disk_write_complete(bp) indirdep = WK_INDIRDEP(wk); if (indirdep->ir_state & GOINGAWAY) panic("disk_write_complete: indirdep gone"); - bp->b_data = (caddr_t)indirdep->ir_saveddata; + bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); + FREE(indirdep->ir_saveddata, M_INDIRDEP); + indirdep->ir_saveddata = 0; indirdep->ir_state &= ~UNDONE; indirdep->ir_state |= ATTACHED; while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { @@ -2981,13 +3259,13 @@ softdep_disk_write_complete(bp) * this routine is always called from interrupt level with further * splbio interrupts blocked. */ -static void +STATIC void handle_allocdirect_partdone(adp) struct allocdirect *adp; /* the completed allocdirect */ { struct allocdirect *listadp; struct inodedep *inodedep; - long bsize; + long bsize, delay; if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; @@ -3004,8 +3282,7 @@ handle_allocdirect_partdone(adp) */ inodedep = adp->ad_inodedep; bsize = inodedep->id_fs->fs_bsize; - for (listadp = TAILQ_FIRST(&inodedep->id_inoupdt); listadp; - listadp = TAILQ_NEXT(listadp, ad_next)) { + TAILQ_FOREACH(listadp, &inodedep->id_inoupdt, ad_next) { /* found our block */ if (listadp == adp) break; @@ -3024,8 +3301,7 @@ handle_allocdirect_partdone(adp) */ if (listadp == NULL) { #ifdef DEBUG - for (listadp = TAILQ_FIRST(&inodedep->id_newinoupdt); listadp; - listadp = TAILQ_NEXT(listadp, ad_next)) + TAILQ_FOREACH(listadp, &inodedep->id_newinoupdt, ad_next) /* found our block */ if (listadp == adp) break; @@ -3037,12 +3313,16 @@ handle_allocdirect_partdone(adp) /* * If we have found the just finished dependency, then free * it along with anything that follows it that is complete. + * If the inode still has a bitmap dependency, then it has + * never been written to disk, hence the on-disk inode cannot + * reference the old fragment so we can free it without delay. */ + delay = (inodedep->id_state & DEPCOMPLETE); for (; adp; adp = listadp) { listadp = TAILQ_NEXT(adp, ad_next); if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; - free_allocdirect(&inodedep->id_inoupdt, adp, 1); + free_allocdirect(&inodedep->id_inoupdt, adp, delay); } } @@ -3051,7 +3331,7 @@ handle_allocdirect_partdone(adp) * this routine is always called from interrupt level with further * splbio interrupts blocked. */ -static void +STATIC void handle_allocindir_partdone(aip) struct allocindir *aip; /* the completed allocindir */ { @@ -3081,7 +3361,7 @@ handle_allocindir_partdone(aip) * that this routine is always called from interrupt level with further * splbio interrupts blocked. */ -static int +STATIC int handle_written_inodeblock(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* buffer containing the inode block */ @@ -3228,7 +3508,7 @@ handle_written_inodeblock(inodedep, bp) * Process a diradd entry after its dependent inode has been written. * This routine must be called with splbio interrupts blocked. 
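Taken together, the initiation and completion hunks above replace the old pointer swap on indirect blocks with explicit copies: when the write starts, the up-to-date contents are parked in ir_saveddata and the safe copy goes out to disk; when the write completes, the saved contents are copied back and the scratch memory freed. A self-contained sketch of that save/roll-back/restore cycle over ordinary memory buffers (the block size and field names here are illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BLKSIZE 8	/* tiny "indirect block" for the example */

struct indirdep_model {
	int *saveddata;		/* up-to-date pointers, parked during I/O */
	int safecopy[BLKSIZE];	/* contents known to be safe on disk */
};

/* Write initiation: stash the new pointers, put the safe copy in the buffer. */
static void
io_initiation(struct indirdep_model *ir, int *bpdata)
{
	ir->saveddata = malloc(sizeof(int) * BLKSIZE);
	if (ir->saveddata == NULL)
		exit(1);
	memcpy(ir->saveddata, bpdata, sizeof(int) * BLKSIZE);
	memcpy(bpdata, ir->safecopy, sizeof(int) * BLKSIZE);
}

/* Write completion: the new pointers go back into the in-core buffer. */
static void
io_complete(struct indirdep_model *ir, int *bpdata)
{
	memcpy(bpdata, ir->saveddata, sizeof(int) * BLKSIZE);
	free(ir->saveddata);
	ir->saveddata = NULL;
}

int
main(void)
{
	struct indirdep_model ir = { NULL, { 0 } };	/* safe copy: all zero */
	int bp[BLKSIZE] = { 11, 22 };			/* new, unsafe pointers */

	io_initiation(&ir, bp);
	printf("written to disk: %d %d ...\n", bp[0], bp[1]);	/* rolled back to 0 0 */
	io_complete(&ir, bp);
	printf("in core after I/O: %d %d ...\n", bp[0], bp[1]);	/* 11 22 again */
	return (0);
}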
*/ -static void +STATIC void diradd_inode_written(dap, inodedep) struct diradd *dap; struct inodedep *inodedep; @@ -3250,7 +3530,7 @@ diradd_inode_written(dap, inodedep) /* * Handle the completion of a mkdir dependency. */ -static void +STATIC void handle_written_mkdir(mkdir, type) struct mkdir *mkdir; int type; @@ -3283,7 +3563,7 @@ handle_written_mkdir(mkdir, type) * Note that this routine is always called from interrupt level * with further splbio interrupts blocked. */ -static int +STATIC int handle_written_filepage(pagedep, bp) struct pagedep *pagedep; struct buf *bp; /* buffer containing the written page */ @@ -3396,12 +3676,7 @@ softdep_load_inodeblock(ip) FREE_LOCK(&lk); return; } - if (inodedep->id_nlinkdelta != 0) { - ip->i_effnlink -= inodedep->id_nlinkdelta; - ip->i_flag |= IN_MODIFIED; - inodedep->id_nlinkdelta = 0; - (void) free_inodedep(inodedep); - } + ip->i_effnlink -= inodedep->id_nlinkdelta; FREE_LOCK(&lk); } @@ -3433,16 +3708,14 @@ softdep_update_inodeblock(ip, bp, waitfor) * to track. */ ACQUIRE_LOCK(&lk); - if (ip->i_effnlink != ip->i_ffs_nlink) { - (void) inodedep_lookup(ip->i_fs, ip->i_number, DEPALLOC, - &inodedep); - } else if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) { + if (inodedep_lookup(ip->i_fs, ip->i_number, 0, &inodedep) == 0) { + if (ip->i_effnlink != ip->i_ffs_nlink) + panic("softdep_update_inodeblock: bad link count"); FREE_LOCK(&lk); return; } - if (ip->i_ffs_nlink < ip->i_effnlink) + if (inodedep->id_nlinkdelta != ip->i_ffs_nlink - ip->i_effnlink) panic("softdep_update_inodeblock: bad delta"); - inodedep->id_nlinkdelta = ip->i_ffs_nlink - ip->i_effnlink; /* * Changes have been initiated. Anything depending on these * changes cannot occur until this inode has been written. @@ -3482,7 +3755,8 @@ softdep_update_inodeblock(ip, bp, waitfor) } gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT); FREE_LOCK(&lk); - if (gotit && (error = VOP_BWRITE(inodedep->id_buf)) != 0) + if (gotit && + (error = bwrite(inodedep->id_buf)) != 0) softdep_error("softdep_update_inodeblock: bwrite", error); if ((inodedep->id_state & DEPCOMPLETE) == 0) panic("softdep_update_inodeblock: update failed"); @@ -3493,7 +3767,7 @@ softdep_update_inodeblock(ip, bp, waitfor) * inode dependency list (id_inoupdt). This routine must be called * with splbio interrupts blocked. 
*/ -static void +STATIC void merge_inode_lists(inodedep) struct inodedep *inodedep; { @@ -3528,32 +3802,34 @@ int softdep_fsync(vp) struct vnode *vp; /* the "in_core" copy of the inode */ { - struct diradd *dap, *olddap; struct inodedep *inodedep; struct pagedep *pagedep; struct worklist *wk; + struct diradd *dap; struct mount *mnt; struct vnode *pvp; struct inode *ip; struct buf *bp; struct fs *fs; struct proc *p = CURPROC; /* XXX */ - int error, ret, flushparent; - struct timespec ts; + int error, flushparent; ino_t parentino; ufs_lbn_t lbn; + struct timespec ts; ip = VTOI(vp); fs = ip->i_fs; - for (error = 0, flushparent = 0, olddap = NULL; ; ) { - ACQUIRE_LOCK(&lk); - if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) - break; - if (LIST_FIRST(&inodedep->id_inowait) != NULL || - LIST_FIRST(&inodedep->id_bufwait) != NULL || - TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || - TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) - panic("softdep_fsync: pending ops"); + ACQUIRE_LOCK(&lk); + if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) { + FREE_LOCK(&lk); + return (0); + } + if (LIST_FIRST(&inodedep->id_inowait) != NULL || + LIST_FIRST(&inodedep->id_bufwait) != NULL || + TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || + TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) + panic("softdep_fsync: pending ops"); + for (error = 0, flushparent = 0; ; ) { if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) break; if (wk->wk_type != D_DIRADD) @@ -3561,13 +3837,6 @@ softdep_fsync(vp) TYPENAME(wk->wk_type)); dap = WK_DIRADD(wk); /* - * If we have failed to get rid of all the dependencies - * then something is seriously wrong. - */ - if (dap == olddap) - panic("softdep_fsync: flush failed"); - olddap = dap; - /* * Flush our parent if this directory entry * has a MKDIR_PARENT dependency. */ @@ -3600,11 +3869,10 @@ softdep_fsync(vp) */ FREE_LOCK(&lk); VOP_UNLOCK(vp, 0, p); - if ((error = VFS_VGET(mnt, parentino, &pvp)) != 0) { - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - return (error); - } + error = VFS_VGET(mnt, parentino, &pvp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (error != 0) + return (error); if (flushparent) { TIMEVAL_TO_TIMESPEC(&time, &ts); if ((error = VOP_UPDATE(pvp, &ts, &ts, MNT_WAIT))) { @@ -3617,12 +3885,14 @@ softdep_fsync(vp) */ error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred, &bp); - ret = VOP_BWRITE(bp); + if (error == 0) + error = bwrite(bp); vput(pvp); if (error != 0) return (error); - if (ret != 0) - return (ret); + ACQUIRE_LOCK(&lk); + if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) + break; } FREE_LOCK(&lk); return (0); @@ -3640,8 +3910,8 @@ softdep_fsync_mountdev(vp) struct buf *bp, *nbp; struct worklist *wk; - if (vp->v_type != VBLK) - panic("softdep_fsync_mountdev: vnode not VBLK"); + if (!vn_isdisk(vp, NULL)) + panic("softdep_fsync_mountdev: vnode not a disk"); ACQUIRE_LOCK(&lk); for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { nbp = LIST_NEXT(bp, b_vnbufs); @@ -3650,6 +3920,8 @@ softdep_fsync_mountdev(vp) */ if (bp->b_flags & B_BUSY) continue; + bp->b_flags |= B_BUSY; + if ((bp->b_flags & B_DELWRI) == 0) panic("softdep_fsync_mountdev: not dirty"); /* @@ -3657,10 +3929,11 @@ softdep_fsync_mountdev(vp) * dependencies. 
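softdep_fsync_mountdev() walks the device vnode's dirty buffer list and, as the check just below shows, pushes only buffers whose lead dependency is a bitmap (D_BMSAFEMAP) dependency, now setting B_BUSY before the check and clearing it again when a buffer is skipped. A compact model of that filter over an array of fake buffers (flag values and names are stand-ins):

#include <stdio.h>

#define B_BUSY		0x01
#define B_DELWRI	0x02

enum deptype { D_NONE, D_BMSAFEMAP, D_OTHER };

struct buf_model {
	const char *name;
	int b_flags;
	enum deptype first_dep;	/* stand-in for LIST_FIRST(&bp->b_dep) */
};

/* Start writes only for dirty buffers led by a bitmap dependency. */
static void
fsync_mountdev_model(struct buf_model *bufs, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		struct buf_model *bp = &bufs[i];

		if (bp->b_flags & B_BUSY)
			continue;
		bp->b_flags |= B_BUSY;
		if (bp->first_dep != D_BMSAFEMAP) {
			bp->b_flags &= ~B_BUSY;	/* not ours to push */
			continue;
		}
		printf("bawrite(%s)\n", bp->name);
	}
}

int
main(void)
{
	struct buf_model bufs[] = {
		{ "cg bitmap block", B_DELWRI, D_BMSAFEMAP },
		{ "inode block",     B_DELWRI, D_OTHER },
		{ "busy block",      B_DELWRI | B_BUSY, D_BMSAFEMAP },
	};

	fsync_mountdev_model(bufs, 3);
	return (0);
}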
*/ if ((wk = LIST_FIRST(&bp->b_dep)) == NULL || - wk->wk_type != D_BMSAFEMAP) + wk->wk_type != D_BMSAFEMAP) { + bp->b_flags &= ~B_BUSY; continue; + } bremfree(bp); - bp->b_flags |= B_BUSY; FREE_LOCK(&lk); (void) bawrite(bp); ACQUIRE_LOCK(&lk); @@ -3701,7 +3974,7 @@ softdep_sync_metadata(ap) * Check whether this vnode is involved in a filesystem * that is doing soft dependency processing. */ - if (vp->v_type != VBLK) { + if (!vn_isdisk(vp, NULL)) { if (!DOINGSOFTDEP(vp)) return (0); } else @@ -3745,8 +4018,7 @@ loop: * As we hold the buffer locked, none of its dependencies * will disappear. */ - for (wk = LIST_FIRST(&bp->b_dep); wk; - wk = LIST_NEXT(wk, wk_list)) { + LIST_FOREACH(wk, &bp->b_dep, wk_list) { switch (wk->wk_type) { case D_ALLOCDIRECT: @@ -3785,8 +4057,8 @@ loop: case D_INDIRDEP: restart: - for (aip = LIST_FIRST(&WK_INDIRDEP(wk)->ir_deplisthd); - aip; aip = LIST_NEXT(aip, ai_next)) { + + LIST_FOREACH(aip, &WK_INDIRDEP(wk)->ir_deplisthd, ai_next) { if (aip->ai_state & DEPCOMPLETE) continue; nbp = aip->ai_buf; @@ -3926,7 +4198,8 @@ loop: * way to accomplish this is to sync the entire filesystem (luckily * this happens rarely). */ - if (vp->v_type == VBLK && vp->v_specmountpoint && !VOP_ISLOCKED(vp) && + if (vn_isdisk(vp, NULL) && + vp->v_specmountpoint && !VOP_ISLOCKED(vp) && (error = VFS_SYNC(vp->v_specmountpoint, MNT_WAIT, ap->a_cred, ap->a_p)) != 0) return (error); @@ -3937,7 +4210,7 @@ loop: * Flush the dependencies associated with an inodedep. * Called with splbio blocked. */ -static int +STATIC int flush_inodedep_deps(fs, ino) struct fs *fs; ino_t ino; @@ -3965,8 +4238,7 @@ flush_inodedep_deps(fs, ino) ACQUIRE_LOCK(&lk); if (inodedep_lookup(fs, ino, 0, &inodedep) == 0) return (0); - for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; - adp = TAILQ_NEXT(adp, ad_next)) { + TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) { if (adp->ad_state & DEPCOMPLETE) continue; bp = adp->ad_buf; @@ -3987,8 +4259,7 @@ flush_inodedep_deps(fs, ino) } if (adp != NULL) continue; - for (adp = TAILQ_FIRST(&inodedep->id_newinoupdt); adp; - adp = TAILQ_NEXT(adp, ad_next)) { + TAILQ_FOREACH(adp, &inodedep->id_newinoupdt, ad_next) { if (adp->ad_state & DEPCOMPLETE) continue; bp = adp->ad_buf; @@ -4028,7 +4299,7 @@ flush_inodedep_deps(fs, ino) * Eliminate a pagedep dependency by flushing out all its diradd dependencies. * Called with splbio blocked. */ -static int +STATIC int flush_pagedep_deps(pvp, mp, diraddhdp) struct vnode *pvp; struct mount *mp; @@ -4062,84 +4333,85 @@ flush_pagedep_deps(pvp, mp, diraddhdp) if (dap != LIST_FIRST(diraddhdp)) continue; if (dap->da_state & MKDIR_PARENT) - panic("flush_pagedep_deps: MKDIR"); + panic("flush_pagedep_deps: MKDIR_PARENT"); } /* - * Flush the file on which the directory entry depends. - * If the inode has already been pushed out of the cache, - * then all the block dependencies will have been flushed - * leaving only inode dependencies (e.g., bitmaps). Thus, - * we do a ufs_ihashget to check for the vnode in the cache. - * If it is there, we do a full flush. If it is no longer - * there we need only dispose of any remaining bitmap - * dependencies and write the inode to disk. + * A newly allocated directory must have its "." and + * ".." entries written out before its name can be + * committed in its parent. We do not want or need + * the full semantics of a synchronous VOP_FSYNC as + * that may end up here again, once for each directory + * level in the filesystem. Instead, we push the blocks + * and wait for them to clear. 
We have to fsync twice + * because the first call may choose to defer blocks + * that still have dependencies, but deferral will + * happen at most once. */ inum = dap->da_newinum; - FREE_LOCK(&lk); - if ((vp = ufs_ihashget(ump->um_dev, inum)) == NULL) { + if (dap->da_state & MKDIR_BODY) { + FREE_LOCK(&lk); + if ((error = VFS_VGET(mp, inum, &vp)) != 0) + break; + if ((error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p)) || + (error=VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) { + vput(vp); + break; + } + drain_output(vp, 0); + vput(vp); ACQUIRE_LOCK(&lk); - if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0 - && dap == LIST_FIRST(diraddhdp)) - panic("flush_pagedep_deps: flush 1 failed"); /* - * If the inode still has bitmap dependencies, - * push them to disk. + * If that cleared dependencies, go on to next. */ - if ((inodedep->id_state & DEPCOMPLETE) == 0) { - gotit = getdirtybuf(&inodedep->id_buf,MNT_WAIT); - FREE_LOCK(&lk); - if (gotit && - (error = VOP_BWRITE(inodedep->id_buf)) != 0) - break; - ACQUIRE_LOCK(&lk); - } if (dap != LIST_FIRST(diraddhdp)) continue; - /* - * If the inode is still sitting in a buffer waiting - * to be written, push it to disk. - */ + if (dap->da_state & MKDIR_BODY) + panic("flush_pagedep_deps: MKDIR_BODY"); + } + /* + * Flush the inode on which the directory entry depends. + * Having accounted for MKDIR_PARENT and MKDIR_BODY above, + * the only remaining dependency is that the updated inode + * count must get pushed to disk. The inode has already + * been pushed into its inode buffer (via VOP_UPDATE) at + * the time of the reference count change. So we need only + * locate that buffer, ensure that there will be no rollback + * caused by a bitmap dependency, then write the inode buffer. + */ + if (inodedep_lookup(ump->um_fs, inum, 0, &inodedep) == 0) + panic("flush_pagedep_deps: lost inode"); + /* + * If the inode still has bitmap dependencies, + * push them to disk. + */ + if ((inodedep->id_state & DEPCOMPLETE) == 0) { + gotit = getdirtybuf(&inodedep->id_buf, MNT_WAIT); FREE_LOCK(&lk); - if ((error = bread(ump->um_devvp, - fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)), - (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) - break; - if ((error = VOP_BWRITE(bp)) != 0) + if (gotit && + (error = bwrite(inodedep->id_buf)) != 0) break; ACQUIRE_LOCK(&lk); - if (dap == LIST_FIRST(diraddhdp)) - panic("flush_pagedep_deps: flush 2 failed"); - continue; - } - if (vp->v_type == VDIR) { - /* - * A newly allocated directory must have its "." and - * ".." entries written out before its name can be - * committed in its parent. We do not want or need - * the full semantics of a synchronous VOP_FSYNC as - * that may end up here again, once for each directory - * level in the filesystem. Instead, we push the blocks - * and wait for them to clear. - */ - if ((error = - VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) { - vput(vp); - break; - } - drain_output(vp, 0); + if (dap != LIST_FIRST(diraddhdp)) + continue; } - TIMEVAL_TO_TIMESPEC(&time, &ts); - error = VOP_UPDATE(vp, &ts, &ts, MNT_WAIT); - vput(vp); - if (error) + /* + * If the inode is still sitting in a buffer waiting + * to be written, push it to disk. + */ + FREE_LOCK(&lk); + if ((error = bread(ump->um_devvp, + fsbtodb(ump->um_fs, ino_to_fsba(ump->um_fs, inum)), + (int)ump->um_fs->fs_bsize, NOCRED, &bp)) != 0) + break; + if ((error = bwrite(bp)) != 0) break; + ACQUIRE_LOCK(&lk); /* * If we have failed to get rid of all the dependencies * then something is seriously wrong. 
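The MKDIR_BODY path above deliberately avoids a fully synchronous fsync of a new directory: it issues two non-blocking flushes, because the first pass may defer buffers that still carry dependencies and deferral can happen at most once, and then waits for the output to drain. A toy model of that flush-twice idea, where the hypothetical flush_once() defers a dependent buffer exactly one time:

#include <stdio.h>

/*
 * Model: a buffer with a dependency is skipped the first time it is
 * seen and written the second time, mirroring the at-most-one deferral.
 */
struct buf_model {
	int dirty;
	int has_dep;
	int deferred_once;
};

/* One non-blocking flush pass; returns the number of buffers still dirty. */
static int
flush_once(struct buf_model *bufs, int n)
{
	int i, left = 0;

	for (i = 0; i < n; i++) {
		if (!bufs[i].dirty)
			continue;
		if (bufs[i].has_dep && !bufs[i].deferred_once) {
			bufs[i].deferred_once = 1;	/* defer this pass */
			left++;
			continue;
		}
		bufs[i].dirty = 0;			/* written */
	}
	return (left);
}

int
main(void)
{
	struct buf_model dir[] = {
		{ 1, 0, 0 },	/* "." and ".." block, no dependency */
		{ 1, 1, 0 },	/* block still carrying a dependency */
	};
	int left;

	left = flush_once(dir, 2);
	printf("after first pass: %d buffer(s) deferred\n", left);
	left = flush_once(dir, 2);
	printf("after second pass: %d buffer(s) deferred\n", left);
	/* here the kernel would drain_output() and re-check the list */
	return (0);
}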
*/ if (dap == LIST_FIRST(diraddhdp)) - panic("flush_pagedep_deps: flush 3 failed"); - ACQUIRE_LOCK(&lk); + panic("flush_pagedep_deps: flush failed"); } if (error) ACQUIRE_LOCK(&lk); @@ -4148,17 +4420,35 @@ flush_pagedep_deps(pvp, mp, diraddhdp) /* * A large burst of file addition or deletion activity can drive the - * memory load excessively high. Therefore we deliberately slow things - * down and speed up the I/O processing if we find ourselves with too - * many dependencies in progress. + * memory load excessively high. First attempt to slow things down + * using the techniques below. If that fails, this routine requests + * the offending operations to fall back to running synchronously + * until the memory load returns to a reasonable level. */ -static int +int +softdep_slowdown(vp) + struct vnode *vp; +{ + int max_softdeps_hard; + + max_softdeps_hard = max_softdeps * 11 / 10; + if (num_dirrem < max_softdeps_hard / 2 && + num_inodedep < max_softdeps_hard) + return (0); + stat_sync_limit_hit += 1; + return (1); +} + +/* + * If memory utilization has gotten too high, deliberately slow things + * down and speed up the I/O processing. + */ +STATIC int request_cleanup(resource, islocked) int resource; int islocked; { struct proc *p = CURPROC; - int error; /* * We never hold up the filesystem syncer process. @@ -4166,6 +4456,29 @@ request_cleanup(resource, islocked) if (p == filesys_syncer) return (0); /* + * First check to see if the work list has gotten backlogged. + * If it has, co-opt this process to help clean up two entries. + * Because this process may hold inodes locked, we cannot + * handle any remove requests that might block on a locked + * inode as that could lead to deadlock. + */ + if (num_on_worklist > max_softdeps / 10) { + if (islocked) + FREE_LOCK(&lk); + process_worklist_item(NULL, LK_NOWAIT); + process_worklist_item(NULL, LK_NOWAIT); + stat_worklist_push += 2; + if (islocked) + ACQUIRE_LOCK(&lk); + return(1); + } + /* + * Next, we attempt to speed up the syncer process. If that + * is successful, then we allow the process to continue. + */ + if (speedup_syncer()) + return(0); + /* * If we are resource constrained on inode dependencies, try * flushing some dirty inodes. Otherwise, we are constrained * by file deletions, so try accelerating flushes of directories @@ -4179,12 +4492,14 @@ request_cleanup(resource, islocked) case FLUSH_INODES: stat_ino_limit_push += 1; - req_clear_inodedeps = 1; + req_clear_inodedeps += 1; + stat_countp = &stat_ino_limit_hit; break; case FLUSH_REMOVE: stat_blk_limit_push += 1; - req_clear_remove = 1; + req_clear_remove += 1; + stat_countp = &stat_blk_limit_hit; break; default: @@ -4196,33 +4511,43 @@ request_cleanup(resource, islocked) */ if (islocked == 0) ACQUIRE_LOCK(&lk); + proc_waiting += 1; + if (!timeout_initialized(&proc_waiting_timeout)) { + timeout_set(&proc_waiting_timeout, pause_timer, 0); + timeout_add(&proc_waiting_timeout, tickdelay > 2 ? tickdelay : 2); + } FREE_LOCK_INTERLOCKED(&lk); - error = tsleep((caddr_t)&proc_waiting, PPAUSE | PCATCH, "softupdate", - tickdelay > 2 ? 
tickdelay : 2); + (void) tsleep((caddr_t)&proc_waiting, PPAUSE, "softupdate", 0); ACQUIRE_LOCK_INTERLOCKED(&lk); - if (error == EWOULDBLOCK) { - switch (resource) { - - case FLUSH_INODES: - stat_ino_limit_hit += 1; - break; - - case FLUSH_REMOVE: - stat_blk_limit_hit += 1; - break; - } - } + proc_waiting -= 1; if (islocked == 0) FREE_LOCK(&lk); return (1); } /* - * Flush out a directory with at least one removal dependency in an effort - * to reduce the number of freefile and freeblks dependency structures. + * Awaken processes pausing in request_cleanup and clear proc_waiting + * to indicate that there is no longer a timer running. */ -static void +void +pause_timer(arg) + void *arg; +{ + + *stat_countp += 1; + wakeup_one(&proc_waiting); + if (proc_waiting > 0) + timeout_add(&proc_waiting_timeout, tickdelay > 2 ? tickdelay : 2); + else + timeout_del(&proc_waiting_timeout); +} + +/* + * Flush out a directory with at least one removal dependency in an effort to + * reduce the number of dirrem, freefile, and freeblks dependency structures. + */ +STATIC void clear_remove(p) struct proc *p; { @@ -4239,21 +4564,30 @@ clear_remove(p) pagedephd = &pagedep_hashtbl[next++]; if (next >= pagedep_hash) next = 0; - for (pagedep = LIST_FIRST(pagedephd); pagedep; - pagedep = LIST_NEXT(pagedep, pd_hash)) { + LIST_FOREACH(pagedep, pagedephd, pd_hash) { if (LIST_FIRST(&pagedep->pd_dirremhd) == NULL) continue; mp = pagedep->pd_mnt; ino = pagedep->pd_ino; FREE_LOCK(&lk); +#if 0 + if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) + continue; +#endif if ((error = VFS_VGET(mp, ino, &vp)) != 0) { softdep_error("clear_remove: vget", error); +#if 0 + vn_finished_write(mp); +#endif return; } if ((error = VOP_FSYNC(vp, p->p_ucred, MNT_NOWAIT, p))) softdep_error("clear_remove: fsync", error); drain_output(vp, 0); vput(vp); +#if 0 + vn_finished_write(mp); +#endif return; } } @@ -4264,7 +4598,7 @@ clear_remove(p) * Clear out a block of dirty inodes in an effort to reduce * the number of inodedep dependency structures. */ -static void +STATIC void clear_inodedeps(p) struct proc *p; { @@ -4294,8 +4628,7 @@ clear_inodedeps(p) * Ugly code to find mount point given pointer to superblock. */ fs = inodedep->id_fs; - for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist; - mp = CIRCLEQ_NEXT(mp, mnt_list)) + CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) if ((mp->mnt_flag & MNT_SOFTDEP) && fs == VFSTOUFS(mp)->um_fs) break; /* @@ -4314,8 +4647,15 @@ clear_inodedeps(p) if (inodedep_lookup(fs, ino, 0, &inodedep) == 0) continue; FREE_LOCK(&lk); +#if 0 + if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) + continue; +#endif if ((error = VFS_VGET(mp, ino, &vp)) != 0) { softdep_error("clear_inodedeps: vget", error); +#if 0 + vn_finished_write(mp); +#endif return; } if (ino == lastino) { @@ -4327,17 +4667,101 @@ clear_inodedeps(p) drain_output(vp, 0); } vput(vp); +#if 0 + vn_finished_write(mp); +#endif ACQUIRE_LOCK(&lk); } FREE_LOCK(&lk); } /* + * Function to determine if the buffer has outstanding dependencies + * that will cause a roll-back if the buffer is written. If wantcount + * is set, return number of dependencies, otherwise just yes or no. 
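request_cleanup() now escalates through three remedies: co-opt the caller to process a couple of worklist items, try to speed up the syncer, and only then flag the requested cleanup and pause the caller until pause_timer() wakes it. A compressed model of that escalation; the threshold and field names are stand-ins for the kernel's counters and tunables:

#include <stdio.h>

#define MAX_SOFTDEPS	8000	/* stand-in tunable */

enum action { HELPED_WORKLIST, SPED_UP_SYNCER, PAUSED_FOR_CLEANUP };

struct state_model {
	int num_on_worklist;
	int syncer_can_speed_up;	/* models speedup_syncer() succeeding */
};

/* Decide how to relieve memory pressure from too many dependencies. */
static enum action
request_cleanup_model(struct state_model *st)
{
	if (st->num_on_worklist > MAX_SOFTDEPS / 10) {
		st->num_on_worklist -= 2;	/* caller processes two items */
		return (HELPED_WORKLIST);
	}
	if (st->syncer_can_speed_up)
		return (SPED_UP_SYNCER);
	/* Kernel: set req_clear_*, arm pause_timer(), tsleep(). */
	return (PAUSED_FOR_CLEANUP);
}

int
main(void)
{
	struct state_model backlog = { 2000, 1 };
	struct state_model quiet = { 10, 1 };
	struct state_model stuck = { 10, 0 };
	const char *names[] = {
		"process worklist items", "speed up syncer", "pause and wait"
	};

	printf("%s\n", names[request_cleanup_model(&backlog)]);
	printf("%s\n", names[request_cleanup_model(&quiet)]);
	printf("%s\n", names[request_cleanup_model(&stuck)]);
	return (0);
}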
+ */ +int +softdep_count_dependencies(bp, wantcount) + struct buf *bp; + int wantcount; +{ + struct worklist *wk; + struct inodedep *inodedep; + struct indirdep *indirdep; + struct allocindir *aip; + struct pagedep *pagedep; + struct diradd *dap; + int i, retval; + + retval = 0; + ACQUIRE_LOCK(&lk); + LIST_FOREACH(wk, &bp->b_dep, wk_list) { + switch (wk->wk_type) { + + case D_INODEDEP: + inodedep = WK_INODEDEP(wk); + if ((inodedep->id_state & DEPCOMPLETE) == 0) { + /* bitmap allocation dependency */ + retval += 1; + if (!wantcount) + goto out; + } + if (TAILQ_FIRST(&inodedep->id_inoupdt)) { + /* direct block pointer dependency */ + retval += 1; + if (!wantcount) + goto out; + } + continue; + + case D_INDIRDEP: + indirdep = WK_INDIRDEP(wk); + + LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { + /* indirect block pointer dependency */ + retval += 1; + if (!wantcount) + goto out; + } + continue; + + case D_PAGEDEP: + pagedep = WK_PAGEDEP(wk); + for (i = 0; i < DAHASHSZ; i++) { + + LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { + /* directory entry dependency */ + retval += 1; + if (!wantcount) + goto out; + } + } + continue; + + case D_BMSAFEMAP: + case D_ALLOCDIRECT: + case D_ALLOCINDIR: + case D_MKDIR: + /* never a dependency on these blocks */ + continue; + + default: + panic("softdep_check_for_rollback: Unexpected type %s", + TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + } +out: + FREE_LOCK(&lk); + return retval; +} + +/* * Acquire exclusive access to a buffer. * Must be called with splbio blocked. * Return 1 if buffer was acquired. */ -static int +STATIC int getdirtybuf(bpp, waitfor) struct buf **bpp; int waitfor; @@ -4367,7 +4791,7 @@ getdirtybuf(bpp, waitfor) * Wait for pending output on a vnode to complete. * Must be called with vnode locked. */ -static void +STATIC void drain_output(vp, islocked) struct vnode *vp; int islocked; diff --git a/sys/ufs/ffs/ffs_softdep_stub.c b/sys/ufs/ffs/ffs_softdep_stub.c index bd06b5fbdd2..2eabe90e9b3 100644 --- a/sys/ufs/ffs/ffs_softdep_stub.c +++ b/sys/ufs/ffs/ffs_softdep_stub.c @@ -1,10 +1,12 @@ -/* $OpenBSD: ffs_softdep_stub.c,v 1.2 1999/12/05 08:30:38 art Exp $ */ +/* $OpenBSD: ffs_softdep_stub.c,v 1.3 2001/02/21 23:24:31 csapuntz Exp $ */ /* - * Copyright 1997 Marshall Kirk McKusick. All Rights Reserved. + * Copyright 1998 Marshall Kirk McKusick. All Rights Reserved. * - * This code is derived from work done by Greg Ganger and Yale Patt at the - * University of Michigan. + * The soft updates code is derived from the appendix of a University + * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, + * "Soft Updates: A Solution to the Metadata Update Problem in File + * Systems", CSE-TR-254-95, August 1995). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -14,9 +16,9 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. None of the names of McKusick, Ganger, Patt, or the University of - * Michigan may be used to endorse or promote products derived from - * this software without specific prior written permission. + * 3. None of the names of McKusick, Ganger, or the University of Michigan + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -30,7 +32,8 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)ffs_softdep.stub.c 9.1 (McKusick) 7/9/97 + * from: @(#)ffs_softdep_stub.c 9.1 (McKusick) 7/10/97 + * $FreeBSD: src/sys/ufs/ffs/ffs_softdep_stub.c,v 1.14 2000/08/09 00:41:54 tegge Exp $ */ #ifndef FFS_SOFTUPDATES @@ -141,12 +144,10 @@ softdep_setup_freeblocks(ip, length) } void -softdep_freefile(ap) - struct vop_vfree_args /* { - struct vnode *a_pvp; - ino_t a_ino; - int a_mode; - } */ *ap; +softdep_freefile(pvp, ino, mode) + struct vnode *pvp; + ino_t ino; + int mode; { panic("softdep_freefile called"); @@ -200,11 +201,11 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) } void -softdep_increase_linkcnt(ip) +softdep_change_linkcnt(ip) struct inode *ip; { - panic("softdep_increase_linkcnt called"); + panic("softdep_change_linkcnt called"); } void @@ -225,13 +226,23 @@ softdep_update_inodeblock(ip, bp, waitfor) panic("softdep_update_inodeblock called"); } -int -softdep_fsync(vp) +void +softdep_fsync_mountdev(vp) struct vnode *vp; { - panic("softdep_fsync called"); - return (EIO); + return; +} + +int +softdep_flushworklist(oldmnt, countp, p) + struct mount *oldmnt; + int *countp; + struct proc *p; +{ + + *countp = 0; + return (0); } int @@ -247,11 +258,11 @@ softdep_sync_metadata(ap) return (0); } -void -softdep_fsync_mountdev(vp) +int +softdep_slowdown(vp) struct vnode *vp; { - panic("softdep_fsync_mountdev called"); + panic("softdep_slowdown called"); } diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index 2db4c26a878..e50422b6862 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ffs_vfsops.c,v 1.29 2001/02/20 01:50:12 assar Exp $ */ +/* $OpenBSD: ffs_vfsops.c,v 1.30 2001/02/21 23:24:31 csapuntz Exp $ */ /* $NetBSD: ffs_vfsops.c,v 1.19 1996/02/09 22:22:26 christos Exp $ */ /* @@ -868,7 +868,7 @@ ffs_sync(mp, waitfor, cred, p) register struct inode *ip; register struct ufsmount *ump = VFSTOUFS(mp); register struct fs *fs; - int error, allerror = 0; + int error, allerror = 0, count; fs = ump->um_fs; /* @@ -923,6 +923,13 @@ loop: /* * Force stale file system control information to be flushed. */ + if ((ump->um_mountp->mnt_flag & MNT_SOFTDEP) && waitfor == MNT_WAIT) { + if ((error == softdep_flushworklist(ump->um_mountp, &count, p)) + != 0) + allerror = error; + if (count) + goto loop; + } if (waitfor != MNT_LAZY) { if (ump->um_mountp->mnt_flag & MNT_SOFTDEP) waitfor = MNT_NOWAIT; diff --git a/sys/ufs/ffs/softdep.h b/sys/ufs/ffs/softdep.h index 6aec5127608..9b568184916 100644 --- a/sys/ufs/ffs/softdep.h +++ b/sys/ufs/ffs/softdep.h @@ -1,21 +1,17 @@ -/* $OpenBSD: softdep.h,v 1.3 2001/02/10 11:08:40 fgsch Exp $ */ +/* $OpenBSD: softdep.h,v 1.4 2001/02/21 23:24:31 csapuntz Exp $ */ /* - * Copyright 1998 Marshall Kirk McKusick. All Rights Reserved. + * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved. * * The soft updates code is derived from the appendix of a University * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, * "Soft Updates: A Solution to the Metadata Update Problem in File * Systems", CSE-TR-254-95, August 1995). * - * The following are the copyrights and redistribution conditions that - * apply to this copy of the soft update software. 
For a license - * to use, redistribute or sell the soft update software under - * conditions other than those described here, please contact the - * author at one of the following addresses: + * Further information about soft updates can be obtained from: * - * Marshall Kirk McKusick mckusick@mckusick.com - * 1614 Oxford Street +1-510-843-9542 - * Berkeley, CA 94709-1608 + * Marshall Kirk McKusick http://www.mckusick.com/softdep/ + * 1614 Oxford Street mckusick@mckusick.com + * Berkeley, CA 94709-1608 +1-510-843-9542 * USA * * Redistribution and use in source and binary forms, with or without @@ -27,19 +23,6 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. None of the names of McKusick, Ganger, Patt, or the University of - * Michigan may be used to endorse or promote products derived from - * this software without specific prior written permission. - * 4. Redistributions in any form must be accompanied by information on - * how to obtain complete source code for any accompanying software - * that uses this software. This source code must either be included - * in the distribution or be available for no more than the cost of - * distribution plus a nominal fee, and must be freely redistributable - * under reasonable conditions. For an executable file, complete - * source code means the source code for all modules it contains. - * It does not mean source code for modules or files that typically - * accompany the operating system on which the executable file runs, - * e.g., standard library modules or system header files. * * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED @@ -53,7 +36,8 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)softdep.h 9.6 (McKusick) 2/25/99 + * @(#)softdep.h 9.7 (McKusick) 6/21/00 + * $FreeBSD: src/sys/ufs/ffs/softdep.h,v 1.10 2000/06/22 00:29:53 mckusick Exp $ */ #include <sys/queue.h> @@ -353,7 +337,7 @@ struct allocdirect { struct indirdep { struct worklist ir_list; /* buffer holding indirect block */ # define ir_state ir_list.wk_state /* indirect block pointer state */ - ufs_daddr_t *ir_saveddata; /* buffer cache contents */ + caddr_t ir_saveddata; /* buffer cache contents */ struct buf *ir_savebp; /* buffer holding safe copy */ struct allocindirhd ir_donehd; /* done waiting to update safecopy */ struct allocindirhd ir_deplisthd; /* allocindir deps for this block */ @@ -399,7 +383,7 @@ struct freefrag { struct worklist ff_list; /* id_inowait or delayed worklist */ # define ff_state ff_list.wk_state /* owning user; should be uid_t */ struct vnode *ff_devvp; /* filesystem device vnode */ - struct fs *ff_fs; /* addr of superblock */ + struct mount *ff_mnt; /* associated mount point */ ufs_daddr_t ff_blkno; /* fragment physical block number */ long ff_fragsize; /* size of fragment being deleted */ ino_t ff_inum; /* owning inode number */ @@ -415,7 +399,7 @@ struct freeblks { struct worklist fb_list; /* id_inowait or delayed worklist */ ino_t fb_previousinum; /* inode of previous owner of blocks */ struct vnode *fb_devvp; /* filesystem device vnode */ - struct fs *fb_fs; /* addr of superblock */ + struct mount *fb_mnt; /* associated mount point */ off_t fb_oldsize; /* previous file size */ off_t fb_newsize; /* new file size */ int fb_chkcnt; /* used to check cnt of blks released */ @@ -435,7 +419,7 @@ struct freefile { mode_t fx_mode; /* mode of inode */ ino_t fx_oldinum; /* inum of the unlinked file */ struct vnode *fx_devvp; /* filesystem device vnode */ - struct fs *fx_fs; /* addr of superblock */ + struct mount *fx_mnt; /* associated mount point */ }; /* diff --git a/sys/ufs/ufs/inode.h b/sys/ufs/ufs/inode.h index 09f582c4e09..7d23b147966 100644 --- a/sys/ufs/ufs/inode.h +++ b/sys/ufs/ufs/inode.h @@ -1,4 +1,4 @@ -/* $OpenBSD: inode.h,v 1.11 1999/09/10 23:39:10 art Exp $ */ +/* $OpenBSD: inode.h,v 1.12 2001/02/21 23:24:31 csapuntz Exp $ */ /* $NetBSD: inode.h,v 1.8 1995/06/15 23:22:50 cgd Exp $ */ /* @@ -247,6 +247,7 @@ struct indir { #else #define DOINGSOFTDEP(vp) (0) #endif +#define DOINGASYNC(vp) ((vp)->v_mount->mnt_flag & MNT_ASYNC) /* This overlays the fid structure (see mount.h). 
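The DOINGASYNC() macro added here feeds the write-policy choices made a few hunks later in ufs_dirremove(): under soft updates the directory block is normally delay-written unless softdep_slowdown() asks for throttling, while on an async mount without soft updates the write may also be delayed. A rough model of that selection (simplified; the real code additionally special-cases whiteouts and the directory-compaction i_count check):

#include <stdio.h>

struct mount_model {
	int softdep;	/* MNT_SOFTDEP */
	int async;	/* MNT_ASYNC, i.e. DOINGASYNC() */
};

/* Returns the write strategy for the modified directory block. */
static const char *
dirremove_write_policy(const struct mount_model *mp, int slowdown)
{
	if (mp->softdep)
		return (slowdown ? "bwrite (throttle soft updates)"
		    : "bdwrite (delayed write)");
	if (mp->async)
		return ("bdwrite (async mount)");
	return ("bwrite (synchronous)");
}

int
main(void)
{
	struct mount_model softdep_mnt = { 1, 0 };
	struct mount_model async_mnt = { 0, 1 };
	struct mount_model plain_mnt = { 0, 0 };

	printf("%s\n", dirremove_write_policy(&softdep_mnt, 0));
	printf("%s\n", dirremove_write_policy(&softdep_mnt, 1));
	printf("%s\n", dirremove_write_policy(&async_mnt, 0));
	printf("%s\n", dirremove_write_policy(&plain_mnt, 0));
	return (0);
}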
*/ struct ufid { diff --git a/sys/ufs/ufs/ufs_extern.h b/sys/ufs/ufs/ufs_extern.h index 66d5cac00c2..bdf0c2b3abf 100644 --- a/sys/ufs/ufs/ufs_extern.h +++ b/sys/ufs/ufs/ufs_extern.h @@ -1,4 +1,4 @@ -/* $OpenBSD: ufs_extern.h,v 1.9 2000/02/07 04:57:19 assar Exp $ */ +/* $OpenBSD: ufs_extern.h,v 1.10 2001/02/21 23:24:31 csapuntz Exp $ */ /* $NetBSD: ufs_extern.h,v 1.5 1996/02/09 22:36:03 christos Exp $ */ /*- @@ -180,6 +180,7 @@ void softdep_setup_remove __P((struct buf *,struct inode *, struct inode *, int)); void softdep_setup_directory_change __P((struct buf *, struct inode *, struct inode *, long, int)); -void softdep_increase_linkcnt __P((struct inode *)); +void softdep_change_linkcnt __P((struct inode *)); +int softdep_slowdown __P((struct vnode *)); __END_DECLS diff --git a/sys/ufs/ufs/ufs_lookup.c b/sys/ufs/ufs/ufs_lookup.c index 623128442d6..bc8967bd75f 100644 --- a/sys/ufs/ufs/ufs_lookup.c +++ b/sys/ufs/ufs/ufs_lookup.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ufs_lookup.c,v 1.12 1999/02/26 03:35:18 art Exp $ */ +/* $OpenBSD: ufs_lookup.c,v 1.13 2001/02/21 23:24:31 csapuntz Exp $ */ /* $NetBSD: ufs_lookup.c,v 1.7 1996/02/09 22:36:06 christos Exp $ */ /* @@ -958,19 +958,31 @@ ufs_dirremove(dvp, ip, flags, isrmdir) ep->d_reclen += dp->i_reclen; } out: - if (ip) { - ip->i_effnlink--; - ip->i_flag |= IN_CHANGE; - } if (DOINGSOFTDEP(dvp)) { - if (ip) - softdep_setup_remove(bp, dp, ip, isrmdir); - bdwrite(bp); + if (ip) { + ip->i_effnlink--; + softdep_change_linkcnt(ip); + softdep_setup_remove(bp, dp, ip, isrmdir); + } + if (softdep_slowdown(dvp)) { + error = bwrite(bp); + } else { + bdwrite(bp); + error = 0; + } } else { - if (ip) - ip->i_ffs_nlink--; /* XXX */ - - error = VOP_BWRITE(bp); + if (ip) { + ip->i_effnlink--; + ip->i_ffs_nlink--; + ip->i_flag |= IN_CHANGE; + } + if (flags & DOWHITEOUT) + error = bwrite(bp); + else if (DOINGASYNC(dvp) && dp->i_count != 0) { + bdwrite(bp); + error = 0; + } else + error = bwrite(bp); } dp->i_flag |= IN_CHANGE | IN_UPDATE; return (error); @@ -1000,13 +1012,19 @@ ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir) if (vdp->v_mount->mnt_maxsymlinklen > 0) ep->d_type = newtype; oip->i_effnlink--; - oip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vdp)) { + softdep_change_linkcnt(oip); softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir); bdwrite(bp); } else { - oip->i_ffs_nlink--; /* XXX */ - error = VOP_BWRITE(bp); + oip->i_ffs_nlink--; + oip->i_flag |= IN_CHANGE; + if (DOINGASYNC(vdp)) { + bdwrite(bp); + error = 0; + } else { + error = VOP_BWRITE(bp); + } } dp->i_flag |= IN_CHANGE | IN_UPDATE; return (error); diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index 2a648f44883..e224d37e729 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ufs_vnops.c,v 1.28 2000/11/21 21:49:57 provos Exp $ */ +/* $OpenBSD: ufs_vnops.c,v 1.29 2001/02/21 23:24:32 csapuntz Exp $ */ /* $NetBSD: ufs_vnops.c,v 1.18 1996/05/11 18:28:04 mycroft Exp $ */ /* @@ -758,7 +758,7 @@ ufs_link(v) ip->i_ffs_nlink++; ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vp)) - softdep_increase_linkcnt(ip); + softdep_change_linkcnt(ip); TIMEVAL_TO_TIMESPEC(&time, &ts); if ((error = VOP_UPDATE(vp, &ts, &ts, !DOINGSOFTDEP(vp))) == 0) { ufs_makedirentry(ip, cnp, &newdir); @@ -768,6 +768,8 @@ ufs_link(v) ip->i_effnlink--; ip->i_ffs_nlink--; ip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(vp)) + softdep_change_linkcnt(ip); } FREE(cnp->cn_pnbuf, M_NAMEI); VN_KNOTE(vp, NOTE_LINK); @@ -924,9 +926,22 @@ abortit: error = EPERM; goto abortit; } + + /* + * Check 
diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c
index 2a648f44883..e224d37e729 100644
--- a/sys/ufs/ufs/ufs_vnops.c
+++ b/sys/ufs/ufs/ufs_vnops.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ufs_vnops.c,v 1.28 2000/11/21 21:49:57 provos Exp $ */
+/* $OpenBSD: ufs_vnops.c,v 1.29 2001/02/21 23:24:32 csapuntz Exp $ */
 /* $NetBSD: ufs_vnops.c,v 1.18 1996/05/11 18:28:04 mycroft Exp $ */
 
 /*
@@ -758,7 +758,7 @@ ufs_link(v)
 	ip->i_ffs_nlink++;
 	ip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(vp))
-		softdep_increase_linkcnt(ip);
+		softdep_change_linkcnt(ip);
 	TIMEVAL_TO_TIMESPEC(&time, &ts);
 	if ((error = VOP_UPDATE(vp, &ts, &ts, !DOINGSOFTDEP(vp))) == 0) {
 		ufs_makedirentry(ip, cnp, &newdir);
@@ -768,6 +768,8 @@
 		ip->i_effnlink--;
 		ip->i_ffs_nlink--;
 		ip->i_flag |= IN_CHANGE;
+		if (DOINGSOFTDEP(vp))
+			softdep_change_linkcnt(ip);
 	}
 	FREE(cnp->cn_pnbuf, M_NAMEI);
 	VN_KNOTE(vp, NOTE_LINK);
@@ -924,9 +926,22 @@ abortit:
 		error = EPERM;
 		goto abortit;
 	}
+
+	/*
+	 * Check if just deleting a link name or if we've lost a race.
+	 * If another process completes the same rename after we've looked
+	 * up the source and have blocked looking up the target, then the
+	 * source and target inodes may be identical now although the
+	 * names were never linked.
+	 */
 	if (fvp == tvp) {
 		if (fvp->v_type == VDIR) {
-			error = EINVAL;
+			/*
+			 * Linked directories are impossible, so we must
+			 * have lost the race. Pretend that the rename
+			 * completed before the lookup.
+			 */
+			error = ENOENT;
 			goto abortit;
 		}
@@ -935,7 +950,12 @@ abortit:
 		vput(tdvp);
 		vput(tvp);
 
-		/* Delete source. */
+		/*
+		 * Delete source. There is another race now that everything
+		 * is unlocked, but this doesn't cause any new complications.
+		 * Relookup() may find a file that is unrelated to the
+		 * original one, or it may fail. Too bad.
+		 */
 		vrele(fdvp);
 		vrele(fvp);
 		fcnp->cn_flags &= ~MODMASK;
@@ -1012,7 +1032,7 @@ abortit:
 	ip->i_ffs_nlink++;
 	ip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(fvp))
-		softdep_increase_linkcnt(ip);
+		softdep_change_linkcnt(ip);
 	TIMEVAL_TO_TIMESPEC(&time, &ts);
 	if ((error = VOP_UPDATE(fvp, &ts, &ts, !DOINGSOFTDEP(fvp))) != 0) {
 		VOP_UNLOCK(fvp, 0, p);
@@ -1077,12 +1097,14 @@ abortit:
 		dp->i_ffs_nlink++;
 		dp->i_flag |= IN_CHANGE;
 		if (DOINGSOFTDEP(tdvp))
-			softdep_increase_linkcnt(dp);
+			softdep_change_linkcnt(dp);
 		if ((error = VOP_UPDATE(tdvp, &ts, &ts,
 		    !DOINGSOFTDEP(tdvp))) != 0) {
 			dp->i_effnlink--;
 			dp->i_ffs_nlink--;
 			dp->i_flag |= IN_CHANGE;
+			if (DOINGSOFTDEP(tdvp))
+				softdep_change_linkcnt(dp);
 			goto bad;
 		}
 	}
@@ -1092,6 +1114,8 @@ abortit:
 			dp->i_effnlink--;
 			dp->i_ffs_nlink--;
 			dp->i_flag |= IN_CHANGE;
+			if (DOINGSOFTDEP(tdvp))
+				softdep_change_linkcnt(dp);
 			(void)VOP_UPDATE(tdvp, &ts, &ts, 1);
 		}
 		goto bad;
@@ -1105,7 +1129,7 @@ abortit:
 		 * Short circuit rename(foo, foo).
 		 */
 		if (xp->i_number == ip->i_number)
-			panic("rename: same file");
+			panic("ufs_rename: same file");
 		/*
 		 * If the parent directory is "sticky", then the user must
 		 * own the parent directory, or the destination of the rename,
@@ -1146,10 +1170,12 @@ abortit:
 		if (doingdirectory) {
 			if (!newparent) {
 				dp->i_effnlink--;
-				dp->i_flag |= IN_CHANGE;
+				if (DOINGSOFTDEP(tdvp))
+					softdep_change_linkcnt(dp);
 			}
 			xp->i_effnlink--;
-			xp->i_flag |= IN_CHANGE;
+			if (DOINGSOFTDEP(tvp))
+				softdep_change_linkcnt(xp);
 		}
 		if (doingdirectory && !DOINGSOFTDEP(tvp)) {
 			/*
@@ -1163,10 +1189,13 @@ abortit:
 			 * disk, so when running with that code we avoid doing
 			 * them now.
 			 */
-			if (!newparent)
+			if (!newparent) {
 				dp->i_ffs_nlink--;
+				dp->i_flag |= IN_CHANGE;
+			}
 			xp->i_ffs_nlink--;
+			xp->i_flag |= IN_CHANGE;
 			if ((error = VOP_TRUNCATE(tvp, (off_t)0, IO_SYNC, tcnp->cn_cred,
 			    tcnp->cn_proc)) != 0)
 				goto bad;
@@ -1194,7 +1223,7 @@ abortit:
 		 * From name has disappeared.
 		 */
 		if (doingdirectory)
-			panic("rename: lost dir entry");
+			panic("ufs_rename: lost dir entry");
 		vrele(ap->a_fvp);
 		return (0);
 	}
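One bookkeeping rule runs through the ufs_link() and ufs_rename() hunks above: i_effnlink is the link count the inode will have once all queued directory updates reach the disk, i_ffs_nlink is the on-disk count, and softdep_change_linkcnt() is called whenever the effective count moves so the dependency code can track it. On the removal side, only i_effnlink changes up front when soft dependencies are active; otherwise both counters and IN_CHANGE are updated immediately. The sketch below shows that removal-path pattern only; the structure and the notification hook are hypothetical stand-ins for struct inode and softdep_change_linkcnt().

#include <stdio.h>

#define IN_CHANGE 0x0002		/* illustrative; the real flag lives in ufs/inode.h */

/* Hypothetical stand-in for the fields of struct inode used here. */
struct inode_stub {
	int i_effnlink;		/* links once queued directory writes complete */
	int i_ffs_nlink;	/* on-disk link count */
	int i_flag;
	int softdep;		/* stands in for DOINGSOFTDEP(vp) */
};

/* Stand-in for softdep_change_linkcnt(): record the new effective count. */
static void
linkcnt_changed(struct inode_stub *ip)
{
	printf("softdep now tracking effnlink=%d\n", ip->i_effnlink);
}

/* Removal-path pattern: drop one link the way ufs_dirremove()/ufs_rename() do. */
static void
drop_link(struct inode_stub *ip)
{
	ip->i_effnlink--;
	if (ip->softdep) {
		/* the on-disk count is lowered later, by the dependency code */
		linkcnt_changed(ip);
	} else {
		ip->i_ffs_nlink--;
		ip->i_flag |= IN_CHANGE;
	}
}

int
main(void)
{
	struct inode_stub ino = { 2, 2, 0, 1 };

	drop_link(&ino);
	printf("effnlink=%d on-disk nlink=%d\n", ino.i_effnlink, ino.i_ffs_nlink);
	return (0);
}

The error paths in the same hunks undo a speculative increment by lowering both counters and still calling the notification hook, which is why the new calls also appear where i_ffs_nlink is adjusted directly.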
@@ -1209,7 +1238,7 @@ abortit:
 	 */
 	if (xp != ip) {
 		if (doingdirectory)
-			panic("rename: lost dir entry");
+			panic("ufs_rename: lost dir entry");
 	} else {
 		/*
 		 * If the source is a directory with a
@@ -1244,6 +1273,9 @@ out:
 		ip->i_effnlink--;
 		ip->i_ffs_nlink--;
 		ip->i_flag |= IN_CHANGE;
+		ip->i_flag &= ~IN_RENAME;
+		if (DOINGSOFTDEP(fvp))
+			softdep_change_linkcnt(ip);
 		vput(fvp);
 	} else
 		vrele(fvp);
@@ -1311,7 +1343,7 @@ ufs_mkdir(v)
 	ip->i_effnlink = 2;
 	ip->i_ffs_nlink = 2;
 	if (DOINGSOFTDEP(tvp))
-		softdep_increase_linkcnt(ip);
+		softdep_change_linkcnt(ip);
 
 	if (cnp->cn_flags & ISWHITEOUT)
 		ip->i_ffs_flags |= UF_OPAQUE;
@@ -1325,7 +1357,7 @@
 	dp->i_ffs_nlink++;
 	dp->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(dvp))
-		softdep_increase_linkcnt(dp);
+		softdep_change_linkcnt(dp);
 	TIMEVAL_TO_TIMESPEC(&time, &ts);
 	if ((error = VOP_UPDATE(dvp, &ts, &ts, !DOINGSOFTDEP(dvp))) != 0)
 		goto bad;
@@ -1395,6 +1427,8 @@ bad:
 		dp->i_effnlink--;
 		dp->i_ffs_nlink--;
 		dp->i_flag |= IN_CHANGE;
+		if (DOINGSOFTDEP(dvp))
+			softdep_change_linkcnt(dp);
 		/*
 		 * No need to do an explicit VOP_TRUNCATE here, vrele will
 		 * do this for us because we set the link count to 0.
@@ -1402,7 +1436,8 @@ bad:
 		ip->i_effnlink = 0;
 		ip->i_ffs_nlink = 0;
 		ip->i_flag |= IN_CHANGE;
-
+		if (DOINGSOFTDEP(tvp))
+			softdep_change_linkcnt(ip);
 		vput(tvp);
 	}
 out:
@@ -1469,28 +1504,41 @@ ufs_rmdir(v)
 	 * inode. If we crash in between, the directory
 	 * will be reattached to lost+found,
 	 */
-	if ((error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1)) != 0)
+	dp->i_effnlink--;
+	ip->i_effnlink--;
+	if (DOINGSOFTDEP(vp)) {
+		softdep_change_linkcnt(dp);
+		softdep_change_linkcnt(ip);
+	}
+	if ((error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1)) != 0) {
+		dp->i_effnlink++;
+		ip->i_effnlink++;
+		if (DOINGSOFTDEP(vp)) {
+			softdep_change_linkcnt(dp);
+			softdep_change_linkcnt(ip);
+		}
 		goto out;
+	}
+	VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK);
 	cache_purge(dvp);
 	/*
 	 * Truncate inode. The only stuff left in the directory is "." and
 	 * "..". The "." reference is inconsequential since we are quashing
-	 * it. We have removed the "." reference and the reference in the
-	 * parent directory, but there may be other hard links. The soft
-	 * update code will arange to do these operations after the parent
-	 * directory has been deleted on disk, so when running with
-	 * that code we avoid doing them now.
+	 * it. The soft dependency code will arrange to do these operations
+	 * after the parent directory entry has been deleted on disk, so
+	 * when running with that code we avoid doing them now.
 	 */
-	dp->i_effnlink--;
-	dp->i_flag |= IN_CHANGE;
-	ip->i_effnlink--;
-	ip->i_flag |= IN_CHANGE;
 	if (!DOINGSOFTDEP(vp)) {
+		int ioflag;
+		dp->i_ffs_nlink--;
+		dp->i_flag |= IN_CHANGE;
 		ip->i_ffs_nlink--;
-		error = VOP_TRUNCATE(vp, (off_t)0, IO_SYNC, cnp->cn_cred,
-		    cnp->cn_proc);
+		ip->i_flag |= IN_CHANGE;
+		ioflag = DOINGASYNC(vp) ? 0 : IO_SYNC;
+		error = VOP_TRUNCATE(vp, (off_t)0, ioflag, cnp->cn_cred,
+		    cnp->cn_proc);
 	}
 	cache_purge(vp);
 out:
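ufs_rmdir() above now does its accounting before calling ufs_dirremove(): the parent's and the victim directory's effective link counts are dropped and reported to the dependency code, then restored if the directory entry cannot actually be removed. The following compilable sketch shows only that account-then-roll-back shape; remove_entry() and note_linkcnt() are hypothetical stand-ins for ufs_dirremove() and softdep_change_linkcnt().

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for the link-count field of struct inode. */
struct dir_stub {
	int i_effnlink;
};

/* Stand-in for ufs_dirremove(); return non-zero to simulate failure. */
static int
remove_entry(bool fail)
{
	return (fail ? 5 : 0);		/* 5 is purely illustrative */
}

/* Stand-in for softdep_change_linkcnt(). */
static void
note_linkcnt(struct dir_stub *d)
{
	printf("effnlink is now %d\n", d->i_effnlink);
}

/* Shape of the new ufs_rmdir() flow: account first, roll back on failure. */
static int
rmdir_sketch(struct dir_stub *parent, struct dir_stub *dir, bool softdep, bool fail)
{
	int error;

	parent->i_effnlink--;
	dir->i_effnlink--;
	if (softdep) {
		note_linkcnt(parent);
		note_linkcnt(dir);
	}
	if ((error = remove_entry(fail)) != 0) {
		/* The entry is still on disk; restore the effective counts. */
		parent->i_effnlink++;
		dir->i_effnlink++;
		if (softdep) {
			note_linkcnt(parent);
			note_linkcnt(dir);
		}
	}
	return (error);
}

int
main(void)
{
	struct dir_stub parent = { 3 }, dir = { 2 };

	return (rmdir_sketch(&parent, &dir, true, false));
}

When soft dependencies are off, the tail of the function then lowers the on-disk counts and truncates the directory, and the new ioflag selection makes that truncation synchronous only on non-async mounts.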
@@ -2114,7 +2162,7 @@ ufs_makeinode(mode, dvp, vpp, cnp)
 	ip->i_effnlink = 1;
 	ip->i_ffs_nlink = 1;
 	if (DOINGSOFTDEP(tvp))
-		softdep_increase_linkcnt(ip);
+		softdep_change_linkcnt(ip);
 	if ((ip->i_ffs_mode & ISGID) && !groupmember(ip->i_ffs_gid, cnp->cn_cred) &&
 	    suser(cnp->cn_cred, NULL))
@@ -2150,6 +2198,8 @@ bad:
 	ip->i_effnlink = 0;
 	ip->i_ffs_nlink = 0;
 	ip->i_flag |= IN_CHANGE;
+	if (DOINGSOFTDEP(tvp))
+		softdep_change_linkcnt(ip);
 	vput(tvp);
 	return (error);
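The ufs_makeinode() failure path gets the same treatment as the mkdir one: if the new inode cannot be fully set up, both link counts are reset to zero, IN_CHANGE is set, and the dependency code is notified before the vnode is released, so the zero-linked inode can be reclaimed by vput(). A short compilable sketch follows, again with hypothetical stand-ins for struct inode and softdep_change_linkcnt().

#include <stdio.h>

#define IN_CHANGE 0x0002		/* illustrative value */

/* Hypothetical stand-in for the relevant fields of struct inode. */
struct inode_stub {
	int i_effnlink;
	int i_ffs_nlink;
	int i_flag;
	int softdep;			/* stands in for DOINGSOFTDEP(tvp) */
};

/* Stand-in for softdep_change_linkcnt(). */
static void
note_linkcnt(struct inode_stub *ip)
{
	printf("softdep sees effnlink=%d\n", ip->i_effnlink);
}

/* Error path of ufs_makeinode(): zero the counts, then let vput() reclaim. */
static void
makeinode_fail(struct inode_stub *ip)
{
	ip->i_effnlink = 0;
	ip->i_ffs_nlink = 0;
	ip->i_flag |= IN_CHANGE;
	if (ip->softdep)
		note_linkcnt(ip);
	/* the caller then vput()s the vnode, which frees the zero-linked inode */
}

int
main(void)
{
	struct inode_stub ino = { 1, 1, 0, 1 };

	makeinode_fail(&ino);
	printf("nlink=%d flag=%#x\n", ino.i_ffs_nlink, (unsigned)ino.i_flag);
	return (0);
}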