diff options
author | Ted Unangst <tedu@cvs.openbsd.org> | 2004-05-07 01:40:18 +0000 |
---|---|---|
committer | Ted Unangst <tedu@cvs.openbsd.org> | 2004-05-07 01:40:18 +0000 |
commit | b5f3db6e6e90ad19a3f6c02a7ea7693f2d9945bc (patch) | |
tree | 1233cff430f643cf1b6f46e8b815bfbb86d9d6e1 /sys/ufs/ffs/ffs_softdep.c | |
parent | b1c4391b01feeddc979ddceafe505c44612180f3 (diff) |
fatty softdep merge with freebsd. fixes a variety of possible issues.
relevant changes to ffs_softdep.c were 1.95, 1.96, 1.97, 1.98, 1.103, and
1.107 in freebsd. testing marc and otto. ok deraadt
Diffstat (limited to 'sys/ufs/ffs/ffs_softdep.c')
-rw-r--r-- | sys/ufs/ffs/ffs_softdep.c | 285 |
1 files changed, 234 insertions, 51 deletions
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c index e21c2b96790..4d692bc24c8 100644 --- a/sys/ufs/ffs/ffs_softdep.c +++ b/sys/ufs/ffs/ffs_softdep.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ffs_softdep.c,v 1.49 2004/01/20 03:44:06 tedu Exp $ */ +/* $OpenBSD: ffs_softdep.c,v 1.50 2004/05/07 01:40:16 tedu Exp $ */ /* * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved. * @@ -66,25 +66,25 @@ /* * Mapping of dependency structure types to malloc types. */ -#define D_PAGEDEP 1 -#define D_INODEDEP 2 -#define D_NEWBLK 3 -#define D_BMSAFEMAP 4 -#define D_ALLOCDIRECT 5 -#define D_INDIRDEP 6 -#define D_ALLOCINDIR 7 -#define D_FREEFRAG 8 -#define D_FREEBLKS 9 -#define D_FREEFILE 10 -#define D_DIRADD 11 -#define D_MKDIR 12 -#define D_DIRREM 13 +#define D_PAGEDEP 0 +#define D_INODEDEP 1 +#define D_NEWBLK 2 +#define D_BMSAFEMAP 3 +#define D_ALLOCDIRECT 4 +#define D_INDIRDEP 5 +#define D_ALLOCINDIR 6 +#define D_FREEFRAG 7 +#define D_FREEBLKS 8 +#define D_FREEFILE 9 +#define D_DIRADD 10 +#define D_MKDIR 11 +#define D_DIRREM 12 +#define D_NEWDIRBLK 13 #define D_LAST 13 /* * Names of softdep types. */ const char *softdep_typenames[] = { - "invalid", "pagedep", "inodedep", "newblk", @@ -98,6 +98,7 @@ const char *softdep_typenames[] = { "diradd", "mkdir", "dirrem", + "newdirblk", }; #define TYPENAME(type) \ ((unsigned)(type) <= D_LAST ? softdep_typenames[type] : "???") @@ -134,6 +135,7 @@ STATIC struct dirrem *newdirrem(struct buf *, struct inode *, struct inode *, int, struct dirrem **); STATIC void free_diradd(struct diradd *); STATIC void free_allocindir(struct allocindir *, struct inodedep *); +STATIC void free_newdirblk(struct newdirblk *); STATIC int indir_trunc(struct inode *, daddr_t, int, ufs_lbn_t, long *); STATIC void deallocate_dependencies(struct buf *, struct inodedep *); @@ -372,6 +374,7 @@ STATIC struct pool freefile_pool; STATIC struct pool diradd_pool; STATIC struct pool mkdir_pool; STATIC struct pool dirrem_pool; +STATIC struct pool newdirblk_pool; static __inline void softdep_free(struct worklist *item, int type) @@ -426,6 +429,10 @@ softdep_free(struct worklist *item, int type) pool_put(&dirrem_pool, item); break; + case D_NEWDIRBLK: + pool_put(&newdirblk_pool, item); + break; + default: #ifdef DEBUG if (lk.lkt_held != -1) @@ -955,7 +962,8 @@ u_long pagedep_hash; /* size of hash table - 1 */ STATIC struct sema pagedep_in_progress; /* - * Look up a pagedep. Return 1 if found, 0 if not found. + * Look up a pagedep. Return 1 if found, 0 if not found or found + * when asked to allocate but not associated with any buffer. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in pagedeppp. * This routine must be called with splbio interrupts blocked. @@ -986,6 +994,9 @@ top: break; if (pagedep) { *pagedeppp = pagedep; + if ((flags & DEPALLOC) != 0 && + (pagedep->pd_state & ONWORKLIST) == 0) + return (0); return (1); } if ((flags & DEPALLOC) == 0) { @@ -1198,6 +1209,8 @@ softdep_initialize() "mkdirpl", &pool_allocator_nointr); pool_init(&dirrem_pool, sizeof(struct dirrem), 0, 0, 0, "dirrempl", &pool_allocator_nointr); + pool_init(&newdirblk_pool, sizeof(struct newdirblk), 0, 0, 0, + "newdirblkpl", &pool_allocator_nointr); } /* @@ -1424,6 +1437,7 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) adp->ad_newsize = newsize; adp->ad_oldsize = oldsize; adp->ad_state = ATTACHED; + LIST_INIT(&adp->ad_newdirblk); if (newblkno == oldblkno) adp->ad_freefrag = NULL; else @@ -1520,7 +1534,9 @@ allocdirect_merge(adphead, newadp, oldadp) struct allocdirect *newadp; /* allocdirect being added */ struct allocdirect *oldadp; /* existing allocdirect being checked */ { + struct worklist *wk; struct freefrag *freefrag; + struct newdirblk *newdirblk; #ifdef DEBUG if (lk.lkt_held == -1) @@ -1530,7 +1546,7 @@ allocdirect_merge(adphead, newadp, oldadp) newadp->ad_oldsize != oldadp->ad_newsize || newadp->ad_lbn >= NDADDR) { FREE_LOCK(&lk); - panic("allocdirect_check: old %d != new %d || lbn %ld >= %d", + panic("allocdirect_merge: old %d != new %d || lbn %ld >= %d", newadp->ad_oldblkno, oldadp->ad_newblkno, newadp->ad_lbn, NDADDR); } @@ -1557,6 +1573,17 @@ allocdirect_merge(adphead, newadp, oldadp) newadp->ad_freefrag = oldadp->ad_freefrag; oldadp->ad_freefrag = freefrag; } + /* + * If we are tracking a new directory-block allocation, + * move it from the old allocdirect to the new allocdirect. + */ + if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) { + newdirblk = WK_NEWDIRBLK(wk); + WORKLIST_REMOVE(&newdirblk->db_list); + if (LIST_FIRST(&oldadp->ad_newdirblk) != NULL) + panic("allocdirect_merge: extra newdirblk"); + WORKLIST_INSERT(&newadp->ad_newdirblk, &newdirblk->db_list); + } free_allocdirect(adphead, oldadp, 0); } @@ -2047,6 +2074,21 @@ deallocate_dependencies(bp, inodedep) WORKLIST_INSERT(&inodedep->id_bufwait, &dirrem->dm_list); } + if ((pagedep->pd_state & NEWBLOCK) != 0) { + LIST_FOREACH(wk, &inodedep->id_bufwait, wk_list) + if (wk->wk_type == D_NEWDIRBLK && + WK_NEWDIRBLK(wk)->db_pagedep == + pagedep) + break; + if (wk != NULL) { + WORKLIST_REMOVE(wk); + free_newdirblk(WK_NEWDIRBLK(wk)); + } else { + FREE_LOCK(&lk); + panic("deallocate_dependencies: " + "lost pagedep"); + } + } WORKLIST_REMOVE(&pagedep->pd_list); LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); @@ -2082,6 +2124,8 @@ free_allocdirect(adphead, adp, delay) struct allocdirect *adp; int delay; { + struct newdirblk *newdirblk; + struct worklist *wk; #ifdef DEBUG if (lk.lkt_held == -1) @@ -2099,10 +2143,65 @@ free_allocdirect(adphead, adp, delay) else add_to_worklist(&adp->ad_freefrag->ff_list); } + if ((wk = LIST_FIRST(&adp->ad_newdirblk)) != NULL) { + newdirblk = WK_NEWDIRBLK(wk); + WORKLIST_REMOVE(&newdirblk->db_list); + if (LIST_FIRST(&adp->ad_newdirblk) != NULL) + panic("free_allocdirect: extra newdirblk"); + if (delay) + WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, + &newdirblk->db_list); + else + free_newdirblk(newdirblk); + } WORKITEM_FREE(adp, D_ALLOCDIRECT); } /* + * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep. + * This routine must be called with splbio interrupts blocked. + */ +void +free_newdirblk(newdirblk) + struct newdirblk *newdirblk; +{ + struct pagedep *pagedep; + struct diradd *dap; + int i; + +#ifdef DEBUG + if (lk.lkt_held == -1) + panic("free_newdirblk: lock not held"); +#endif + /* + * If the pagedep is still linked onto the directory buffer + * dependency chain, then some of the entries on the + * pd_pendinghd list may not be committed to disk yet. In + * this case, we will simply clear the NEWBLOCK flag and + * let the pd_pendinghd list be processed when the pagedep + * is next written. If the pagedep is no longer on the buffer + * dependency chain, then all the entries on the pd_pending + * list are committed to disk and we can free them here. + */ + pagedep = newdirblk->db_pagedep; + pagedep->pd_state &= ~NEWBLOCK; + if ((pagedep->pd_state & ONWORKLIST) == 0) + while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) + free_diradd(dap); + /* + * If no dependencies remain, the pagedep will be freed. + */ + for (i = 0; i < DAHASHSZ; i++) + if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL) + break; + if (i == DAHASHSZ && (pagedep->pd_state & ONWORKLIST) == 0) { + LIST_REMOVE(pagedep, pd_hash); + WORKITEM_FREE(pagedep, D_PAGEDEP); + } + WORKITEM_FREE(newdirblk, D_NEWDIRBLK); +} + +/* * Prepare an inode to be freed. The actual free operation is not * done until the zero'ed inode has been written to disk. */ @@ -2413,21 +2512,25 @@ free_allocindir(aip, inodedep) * count has been incremented, but before the directory entry's * pointer to the inode has been set. */ -void -softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp) +int +softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for directory */ off_t diroffset; /* offset of new entry in directory */ long newinum; /* inode referenced by new directory entry */ struct buf *newdirbp; /* non-NULL => contents of new mkdir */ + int isnewblk; /* entry is in a newly allocated block */ { int offset; /* offset of new entry within directory block */ ufs_lbn_t lbn; /* block in directory containing new entry */ struct fs *fs; struct diradd *dap; + struct allocdirect *adp; struct pagedep *pagedep; struct inodedep *inodedep; + struct newdirblk *newdirblk = NULL; struct mkdir *mkdir1, *mkdir2; + /* * Whiteouts have no dependencies. @@ -2435,7 +2538,7 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp) if (newinum == WINO) { if (newdirbp != NULL) bdwrite(newdirbp); - return; + return (0); } fs = dp->i_fs; @@ -2447,6 +2550,11 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp) dap->da_offset = offset; dap->da_newinum = newinum; dap->da_state = ATTACHED; + if (isnewblk && lbn < NDADDR && fragoff(fs, diroffset) == 0) { + newdirblk = pool_get(&newdirblk_pool, PR_WAITOK); + newdirblk->db_list.wk_type = D_NEWDIRBLK; + newdirblk->db_state = 0; + } if (newdirbp == NULL) { dap->da_state |= DEPCOMPLETE; ACQUIRE_LOCK(&lk); @@ -2473,7 +2581,7 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp) * Dependency on link count increase for parent directory */ ACQUIRE_LOCK(&lk); - if (inodedep_lookup(dp->i_fs, dp->i_number, 0, &inodedep) == 0 + if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0 || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state &= ~MKDIR_PARENT; WORKITEM_FREE(mkdir2, D_MKDIR); @@ -2500,7 +2608,57 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp) diradd_inode_written(dap, inodedep); else WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); + if (isnewblk) { + /* + * Directories growing into indirect blocks are rare + * enough and the frequency of new block allocation + * in those cases even more rare, that we choose not + * to bother tracking them. Rather we simply force the + * new directory entry to disk. + */ + if (lbn >= NDADDR) { + FREE_LOCK(&lk); + /* + * We only have a new allocation when at the + * beginning of a new block, not when we are + * expanding into an existing block. + */ + if (blkoff(fs, diroffset) == 0) + return (1); + return (0); + } + /* + * We only have a new allocation when at the beginning + * of a new fragment, not when we are expanding into an + * existing fragment. Also, there is nothing to do if we + * are already tracking this block. + */ + if (fragoff(fs, diroffset) != 0) { + FREE_LOCK(&lk); + return (0); + } + + if ((pagedep->pd_state & NEWBLOCK) != 0) { + WORKITEM_FREE(newdirblk, D_NEWDIRBLK); + FREE_LOCK(&lk); + return (0); + } + /* + * Find our associated allocdirect and have it track us. + */ + if (inodedep_lookup(fs, dp->i_number, 0, &inodedep) == 0) + panic("softdep_setup_directory_add: lost inodedep"); + adp = TAILQ_LAST(&inodedep->id_newinoupdt, allocdirectlst); + if (adp == NULL || adp->ad_lbn != lbn) { + FREE_LOCK(&lk); + panic("softdep_setup_directory_add: lost entry"); + } + pagedep->pd_state |= NEWBLOCK; + newdirblk->db_pagedep = pagedep; + WORKLIST_INSERT(&adp->ad_newdirblk, &newdirblk->db_list); + } FREE_LOCK(&lk); + return (0); } /* @@ -3110,7 +3268,7 @@ softdep_disk_io_initiation(bp) /* * Replace up-to-date version with safe version. */ - MALLOC(indirdep->ir_saveddata, caddr_t, bp->b_bcount, + indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, M_WAITOK); ACQUIRE_LOCK(&lk); indirdep->ir_state &= ~ATTACHED; @@ -3420,7 +3578,7 @@ softdep_disk_write_complete(bp) if (indirdep->ir_state & GOINGAWAY) panic("disk_write_complete: indirdep gone"); bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); - FREE(indirdep->ir_saveddata, M_INDIRDEP); + free(indirdep->ir_saveddata, M_INDIRDEP); indirdep->ir_saveddata = 0; indirdep->ir_state &= ~UNDONE; indirdep->ir_state |= ATTACHED; @@ -3685,6 +3843,10 @@ handle_written_inodeblock(inodedep, bp) add_to_worklist(wk); continue; + case D_NEWDIRBLK: + free_newdirblk(WK_NEWDIRBLK(wk)); + continue; + default: panic("handle_written_inodeblock: Unknown type %s", TYPENAME(wk->wk_type)); @@ -3788,9 +3950,12 @@ handle_written_filepage(pagedep, bp) } /* * Free any directory additions that have been committed. + * If it is a newly allocated block, we have to wait until + * the on-disk directory inode claims the new block. */ - while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) - free_diradd(dap); + if ((pagedep->pd_state & NEWBLOCK) == 0) + while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) + free_diradd(dap); /* * Uncommitted directory entries must be restored. */ @@ -3827,23 +3992,19 @@ handle_written_filepage(pagedep, bp) if ((bp->b_flags & B_DELWRI) == 0) stat_dir_entry++; buf_dirty(bp); + return (1); } /* - * If no dependencies remain, the pagedep will be freed. - * Otherwise it will remain to update the page before it - * is written back to disk. + * If we are not waiting for a new directory block to be + * claimed by its inode, then the pagedep will be freed. + * Otherwise it will remain to track any new entries on + * the page in case they are fsync'ed. */ - if (LIST_FIRST(&pagedep->pd_pendinghd) == 0) { - for (i = 0; i < DAHASHSZ; i++) - if (LIST_FIRST(&pagedep->pd_diraddhd[i]) != NULL) - break; - if (i == DAHASHSZ) { - LIST_REMOVE(pagedep, pd_hash); - WORKITEM_FREE(pagedep, D_PAGEDEP); - return (0); - } + if ((pagedep->pd_state & NEWBLOCK) == 0) { + LIST_REMOVE(pagedep, pd_hash); + WORKITEM_FREE(pagedep, D_PAGEDEP); } - return (1); + return (0); } /* @@ -4013,6 +4174,7 @@ softdep_fsync(vp) struct mount *mnt; struct vnode *pvp; struct inode *ip; + struct inode *pip; struct buf *bp; struct fs *fs; struct proc *p = CURPROC; /* XXX */ @@ -4044,8 +4206,8 @@ softdep_fsync(vp) } dap = WK_DIRADD(wk); /* - * Flush our parent if this directory entry - * has a MKDIR_PARENT dependency. + * Flush our parent if this directory entry has a MKDIR_PARENT + * dependency or is contained in a newly allocated block. */ if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; @@ -4058,7 +4220,11 @@ softdep_fsync(vp) FREE_LOCK(&lk); panic("softdep_fsync: dirty"); } - flushparent = dap->da_state & MKDIR_PARENT; + if ((dap->da_state & MKDIR_PARENT) || + (pagedep->pd_state & NEWBLOCK)) + flushparent = 1; + else + flushparent = 0; /* * If we are being fsync'ed as part of vgone'ing this vnode, * then we will not be able to release and recover the @@ -4082,8 +4248,22 @@ softdep_fsync(vp) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); if (error != 0) return (error); + /* + * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps + * that are contained in direct blocks will be resolved by + * doing a UFS_UPDATE. Pagedeps contained in indirect blocks + * may require a complete sync'ing of the directory. So, we + * try the cheap and fast UFS_UPDATE first, and if that fails, + * then we do the slower VOP_FSYNC of the directory. + */ + pip = VTOI(pvp); if (flushparent) { - if ((error = UFS_UPDATE(VTOI(pvp), MNT_WAIT))) { + if ((error = UFS_UPDATE(pip, 1)) != 0) { + vput(pvp); + return (error); + } + if ((pagedep->pd_state & NEWBLOCK) && + (error = UFS_UPDATE(pip, MNT_WAIT))) { vput(pvp); return (error); } @@ -4091,7 +4271,7 @@ softdep_fsync(vp) /* * Flush directory page containing the inode's name. */ - error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred, + error = bread(pvp, lbn, blksize(fs, pip, lbn), p->p_ucred, &bp); if (error == 0) error = bwrite(bp); @@ -4218,6 +4398,11 @@ softdep_sync_metadata(ap) */ waitfor = MNT_NOWAIT; top: + /* + * We must wait for any I/O in progress to finish so that + * all potential buffers on the dirty list will be visible. + */ + drain_output(vp, 1); if (getdirtybuf(&LIST_FIRST(&vp->v_dirtyblkhd), MNT_WAIT) == 0) { FREE_LOCK(&lk); return (0); @@ -4374,15 +4559,8 @@ loop: goto loop; } /* - * We must wait for any I/O in progress to finish so that - * all potential buffers on the dirty list will be visible. - * Once they are all there, proceed with the second pass - * which will wait for the I/O as per above. - */ - drain_output(vp, 1); - /* * The brief unlock is to allow any pent up dependency - * processing to be done. + * processing to be done. Then proceed with the second pass. */ if (waitfor == MNT_NOWAIT) { waitfor = MNT_WAIT; @@ -4395,7 +4573,11 @@ loop: * If we have managed to get rid of all the dirty buffers, * then we are done. For certain directories and block * devices, we may need to do further work. + * + * We must wait for any I/O in progress to finish so that + * all potential buffers on the dirty list will be visible. */ + drain_output(vp, 1); if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { FREE_LOCK(&lk); return (0); @@ -4891,6 +5073,7 @@ clear_inodedeps(p) vn_finished_write(mp); #endif ACQUIRE_LOCK(&lk); + drain_output(vp, 1); } FREE_LOCK(&lk); } |