From 2b4c38eeb2da4295234fff1cb4e7919811fa0946 Mon Sep 17 00:00:00 2001 From: Artur Grabowski Date: Wed, 22 Apr 2009 13:12:27 +0000 Subject: Make the interactions in allocating buffers less confusing. - getnewbuf dies. instead of having getnewbuf, buf_get, buf_stub and buf_init we now have buf_get that is smaller than some of those functions were before. - Instead of allocating anonymous buffers and then freeing them if we happened to lose the race to the hash, always allocate a buffer knowing which it will belong to. - In cluster read, instead of allocating an anonymous buffer to cover the whole read and then stubs for every buffer under it, make the first buffer in the cluster cover the whole range and then shrink it in the callback. now, all buffers are always on the correct hash and we always know their identity. discussed with many, kettenis@ ok --- sys/kern/vfs_bio.c | 366 +++++++++++++++++++------------------------------- sys/kern/vfs_biomem.c | 57 +++++--- sys/sys/buf.h | 4 +- 3 files changed, 184 insertions(+), 243 deletions(-) diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index a2ed79f97f4..5be42faeac0 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vfs_bio.c,v 1.111 2009/03/23 15:10:44 beck Exp $ */ +/* $OpenBSD: vfs_bio.c,v 1.112 2009/04/22 13:12:26 art Exp $ */ /* $NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $ */ /*- @@ -92,8 +92,6 @@ struct bio_ops bioops; */ struct pool bufpool; struct bufhead bufhead = LIST_HEAD_INITIALIZER(bufhead); -struct buf *buf_get(size_t); -struct buf *buf_stub(struct vnode *, daddr64_t); void buf_put(struct buf *); /* @@ -103,8 +101,7 @@ void buf_put(struct buf *); #define binstailfree(bp, dp) TAILQ_INSERT_TAIL(dp, bp, b_freelist) struct buf *bio_doread(struct vnode *, daddr64_t, int, int); -struct buf *getnewbuf(size_t, int, int, int *); -void buf_init(struct buf *); +struct buf *buf_get(struct vnode *, daddr64_t, size_t); void bread_cluster_callback(struct buf *); /* @@ -165,98 +162,6 @@ bremfree(struct buf *bp) bcstats.freebufs--; } -void -buf_init(struct buf *bp) -{ - splassert(IPL_BIO); - - bzero((char *)bp, sizeof *bp); - bp->b_vnbufs.le_next = NOLIST; - bp->b_freelist.tqe_next = NOLIST; - bp->b_synctime = time_uptime + 300; - bp->b_dev = NODEV; - LIST_INIT(&bp->b_dep); -} - -/* - * This is a non-sleeping expanded equivalent of getblk() that allocates only - * the buffer structure, and not its contents. - */ -struct buf * -buf_stub(struct vnode *vp, daddr64_t lblkno) -{ - struct buf *bp; - int s; - - s = splbio(); - bp = pool_get(&bufpool, PR_NOWAIT); - splx(s); - - if (bp == NULL) - return (NULL); - - bzero((char *)bp, sizeof *bp); - bp->b_vnbufs.le_next = NOLIST; - bp->b_freelist.tqe_next = NOLIST; - bp->b_synctime = time_uptime + 300; - bp->b_dev = NODEV; - bp->b_bufsize = 0; - bp->b_data = NULL; - bp->b_flags = 0; - bp->b_dev = NODEV; - bp->b_blkno = bp->b_lblkno = lblkno; - bp->b_iodone = NULL; - bp->b_error = 0; - bp->b_resid = 0; - bp->b_bcount = 0; - bp->b_dirtyoff = bp->b_dirtyend = 0; - bp->b_validoff = bp->b_validend = 0; - - LIST_INIT(&bp->b_dep); - - buf_acquire_unmapped(bp); - - s = splbio(); - LIST_INSERT_HEAD(&bufhead, bp, b_list); - bcstats.numbufs++; - bgetvp(vp, bp); - splx(s); - - return (bp); -} - -struct buf * -buf_get(size_t size) -{ - struct buf *bp; - int npages; - - splassert(IPL_BIO); - - KASSERT(size > 0); - - size = round_page(size); - npages = atop(size); - - if (bcstats.numbufpages + npages > bufpages) - return (NULL); - - bp = pool_get(&bufpool, PR_WAITOK); - - buf_init(bp); - bp->b_flags = B_INVAL; - buf_alloc_pages(bp, size); - bp->b_data = NULL; - binsheadfree(bp, &bufqueues[BQ_CLEAN]); - binshash(bp, &invalhash); - LIST_INSERT_HEAD(&bufhead, bp, b_list); - bcstats.numbufs++; - bcstats.freebufs++; - bcstats.numcleanpages += atop(bp->b_bufsize); - - return (bp); -} - void buf_put(struct buf *bp) { @@ -277,6 +182,7 @@ buf_put(struct buf *bp) panic("buf_put: b_dep is not empty"); #endif + bremhash(bp); LIST_REMOVE(bp, b_list); bcstats.numbufs--; @@ -438,20 +344,33 @@ breadn(struct vnode *vp, daddr64_t blkno, int size, daddr64_t rablks[], void bread_cluster_callback(struct buf *bp) { + struct buf **xbpp = bp->b_saveaddr; int i; - struct buf **xbpp; - xbpp = (struct buf **)bp->b_saveaddr; + if (xbpp[1] != NULL) { + size_t newsize = xbpp[1]->b_bufsize; - for (i = 0; xbpp[i] != 0; i++) { + /* + * Shrink this buffer to only cover its part of the total I/O. + */ + buf_shrink_mem(bp, newsize); + bp->b_bcount = newsize; + } + + for (i = 1; xbpp[i] != 0; i++) { if (ISSET(bp->b_flags, B_ERROR)) SET(xbpp[i]->b_flags, B_INVAL | B_ERROR); biodone(xbpp[i]); } free(xbpp, M_TEMP); - bp->b_pobj = NULL; - buf_put(bp); + + if (ISSET(bp->b_flags, B_ASYNC)) { + brelse(bp); + } else { + CLR(bp->b_flags, B_WANTED); + wakeup(bp); + } } int @@ -464,14 +383,14 @@ bread_cluster(struct vnode *vp, daddr64_t blkno, int size, struct buf **rbpp) *rbpp = bio_doread(vp, blkno, size, 0); if (size != round_page(size)) - return (biowait(*rbpp)); + goto out; if (VOP_BMAP(vp, blkno + 1, NULL, &sblkno, &maxra)) - return (biowait(*rbpp)); + goto out; maxra++; if (sblkno == -1 || maxra < 2) - return (biowait(*rbpp)); + goto out; howmany = MAXPHYS / size; if (howmany > maxra) @@ -479,66 +398,60 @@ bread_cluster(struct vnode *vp, daddr64_t blkno, int size, struct buf **rbpp) xbpp = malloc((howmany + 1) * sizeof(struct buf *), M_TEMP, M_NOWAIT); if (xbpp == NULL) - return (biowait(*rbpp)); + goto out; - for (i = 0; i < howmany; i++) { - if (incore(vp, blkno + i + 1)) { - for (--i; i >= 0; i--) { - SET(xbpp[i]->b_flags, B_INVAL); - brelse(xbpp[i]); - } - free(xbpp, M_TEMP); - return (biowait(*rbpp)); - } - xbpp[i] = buf_stub(vp, blkno + i + 1); + for (i = howmany - 1; i >= 0; i--) { + size_t sz; + + /* + * First buffer allocates big enough size to cover what + * all the other buffers need. + */ + sz = i == 0 ? howmany * size : 0; + + xbpp[i] = buf_get(vp, blkno + i + 1, sz); if (xbpp[i] == NULL) { - for (--i; i >= 0; i--) { + for (++i; i < howmany; i++) { SET(xbpp[i]->b_flags, B_INVAL); brelse(xbpp[i]); } free(xbpp, M_TEMP); - return (biowait(*rbpp)); + goto out; } } - xbpp[howmany] = 0; + bp = xbpp[0]; - bp = getnewbuf(howmany * size, 0, 0, NULL); - if (bp == NULL) { - for (i = 0; i < howmany; i++) { - SET(xbpp[i]->b_flags, B_INVAL); - brelse(xbpp[i]); - } - free(xbpp, M_TEMP); - return (biowait(*rbpp)); - } + xbpp[howmany] = 0; inc = btodb(size); - for (i = 0; i < howmany; i++) { + for (i = 1; i < howmany; i++) { bcstats.pendingreads++; bcstats.numreads++; SET(xbpp[i]->b_flags, B_READ | B_ASYNC); - binshash(xbpp[i], BUFHASH(vp, xbpp[i]->b_lblkno)); xbpp[i]->b_blkno = sblkno + (i * inc); xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size; xbpp[i]->b_data = NULL; xbpp[i]->b_pobj = bp->b_pobj; xbpp[i]->b_poffs = bp->b_poffs + (i * size); - buf_acquire_unmapped(xbpp[i]); } + KASSERT(bp->b_lblkno == blkno + 1); + KASSERT(bp->b_vp == vp); + bp->b_blkno = sblkno; - bp->b_lblkno = blkno + 1; SET(bp->b_flags, B_READ | B_ASYNC | B_CALL); + bp->b_saveaddr = (void *)xbpp; bp->b_iodone = bread_cluster_callback; - bp->b_vp = vp; + bcstats.pendingreads++; bcstats.numreads++; VOP_STRATEGY(bp); curproc->p_stats->p_ru.ru_inblock++; +out: return (biowait(*rbpp)); } @@ -738,7 +651,6 @@ brelse(struct buf *bp) struct bqueues *bufq; int s; - /* Block disk interrupts. */ s = splbio(); if (bp->b_data != NULL) @@ -766,6 +678,8 @@ brelse(struct buf *bp) if (bp->b_vp) brelvp(bp); + bremhash(bp); + binshash(bp, &invalhash); /* * If the buffer has no associated data, place it back in the @@ -824,7 +738,7 @@ brelse(struct buf *bp) /* Wake up any processes waiting for any buffer to become free. */ if (needbuffer) { needbuffer--; - wakeup_one(&needbuffer); + wakeup(&needbuffer); } /* Wake up any processes waiting for _this_ buffer to become free. */ @@ -866,8 +780,7 @@ incore(struct vnode *vp, daddr64_t blkno) struct buf * getblk(struct vnode *vp, daddr64_t blkno, int size, int slpflag, int slptimeo) { - struct bufhashhdr *bh; - struct buf *bp, *nb = NULL; + struct buf *bp; int s, error; /* @@ -880,7 +793,6 @@ getblk(struct vnode *vp, daddr64_t blkno, int size, int slpflag, int slptimeo) * case, we can't allow the system to allocate a new buffer for * the block until the write is finished. */ - bh = BUFHASH(vp, blkno); start: LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) { if (bp->b_lblkno != blkno || bp->b_vp != vp) @@ -888,12 +800,6 @@ start: s = splbio(); if (ISSET(bp->b_flags, B_BUSY)) { - if (nb != NULL) { - SET(nb->b_flags, B_INVAL); - binshash(nb, &invalhash); - brelse(nb); - nb = NULL; - } SET(bp->b_flags, B_WANTED); error = tsleep(bp, slpflag | (PRIBIO + 1), "getblk", slptimeo); @@ -909,36 +815,14 @@ start: SET(bp->b_flags, B_CACHE); buf_acquire(bp); splx(s); - break; + return (bp); } splx(s); } - if (nb && bp) { - SET(nb->b_flags, B_INVAL); - binshash(nb, &invalhash); - brelse(nb); - nb = NULL; - } - if (bp == NULL && nb == NULL) { - nb = getnewbuf(size, slpflag, slptimeo, &error); - if (nb == NULL) { - if (error == ERESTART || error == EINTR) - return (NULL); - } + + if ((bp = buf_get(vp, blkno, size)) == NULL) goto start; - } - if (nb) { - bp = nb; - binshash(bp, bh); - bp->b_blkno = bp->b_lblkno = blkno; - s = splbio(); - bgetvp(vp, bp); - splx(s); - } -#ifdef DIAGNOSTIC - if (!ISSET(bp->b_flags, B_BUSY)) - panic("getblk buffer not B_BUSY"); -#endif + return (bp); } @@ -950,86 +834,118 @@ geteblk(int size) { struct buf *bp; - while ((bp = getnewbuf(size, 0, 0, NULL)) == NULL) + while ((bp = buf_get(NULL, 0, size)) == NULL) ; - SET(bp->b_flags, B_INVAL); - binshash(bp, &invalhash); return (bp); } /* - * Find a buffer which is available for use. + * Allocate a buffer. */ struct buf * -getnewbuf(size_t size, int slpflag, int slptimeo, int *ep) +buf_get(struct vnode *vp, daddr64_t blkno, size_t size) { struct buf *bp; + int poolwait = size == 0 ? PR_NOWAIT : PR_WAITOK; + int npages; int s; -#if 0 /* we would really like this but sblock update kills it */ - KASSERT(curproc != syncerproc && curproc != cleanerproc); -#endif - s = splbio(); - /* - * Wake up cleaner if we're getting low on pages. - */ - if (bcstats.numdirtypages >= hidirtypages || bcstats.numcleanpages <= locleanpages) - wakeup(&bd_req); + if (size) { + /* + * Wake up cleaner if we're getting low on pages. + */ + if (bcstats.numdirtypages >= hidirtypages || + bcstats.numcleanpages <= locleanpages) + wakeup(&bd_req); - /* - * If we're above the high water mark for clean pages, - * free down to the low water mark. - */ - if (bcstats.numcleanpages > hicleanpages) { - while (bcstats.numcleanpages > locleanpages) { - bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]); - bremfree(bp); - if (bp->b_vp) - brelvp(bp); - bremhash(bp); - buf_put(bp); + /* + * If we're above the high water mark for clean pages, + * free down to the low water mark. + */ + if (bcstats.numcleanpages > hicleanpages) { + while (bcstats.numcleanpages > locleanpages) { + bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]); + bremfree(bp); + if (bp->b_vp) + brelvp(bp); + buf_put(bp); + } + } + + npages = atop(round_page(size)); + + /* + * Free some buffers until we have enough space. + */ + while (bcstats.numbufpages + npages > bufpages) { + int freemax = 5; + int i = freemax; + while ((bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN])) && i--) { + bremfree(bp); + if (bp->b_vp) + brelvp(bp); + buf_put(bp); + } + if (freemax == i) { + needbuffer++; + tsleep(&needbuffer, PRIBIO, "needbuffer", 0); + splx(s); + return (NULL); + } } } - /* we just ask. it can say no.. */ -getsome: - bp = buf_get(size); + bp = pool_get(&bufpool, poolwait|PR_ZERO); + if (bp == NULL) { - int freemax = 5; - int i = freemax; - while ((bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN])) && i--) { - bremfree(bp); - if (bp->b_vp) - brelvp(bp); - bremhash(bp); - buf_put(bp); - } - if (freemax != i) - goto getsome; splx(s); return (NULL); } - bremfree(bp); - /* Buffer is no longer on free lists. */ - bp->b_flags = 0; - buf_acquire(bp); - - splx(s); - - /* clear out various other fields */ + bp->b_freelist.tqe_next = NOLIST; + bp->b_synctime = time_uptime + 300; bp->b_dev = NODEV; - bp->b_blkno = bp->b_lblkno = 0; - bp->b_iodone = NULL; - bp->b_error = 0; - bp->b_resid = 0; + LIST_INIT(&bp->b_dep); bp->b_bcount = size; - bp->b_dirtyoff = bp->b_dirtyend = 0; - bp->b_validoff = bp->b_validend = 0; - bremhash(bp); + buf_acquire_unmapped(bp); + + if (vp != NULL) { + /* + * We insert the buffer into the hash with B_BUSY set + * while we allocate pages for it. This way any getblk + * that happens while we allocate pages will wait for + * this buffer instead of starting its own guf_get. + * + * But first, we check if someone beat us to it. + */ + if (incore(vp, blkno)) { + pool_put(&bufpool, bp); + splx(s); + return (NULL); + } + + bp->b_blkno = bp->b_lblkno = blkno; + bgetvp(vp, bp); + binshash(bp, BUFHASH(vp, blkno)); + } else { + bp->b_vnbufs.le_next = NOLIST; + SET(bp->b_flags, B_INVAL); + binshash(bp, &invalhash); + } + + LIST_INSERT_HEAD(&bufhead, bp, b_list); + bcstats.numbufs++; + + if (size) { + buf_alloc_pages(bp, round_page(size)); + buf_map(bp); + } + + splx(s); + return (bp); } diff --git a/sys/kern/vfs_biomem.c b/sys/kern/vfs_biomem.c index 671921dbc4c..708c06f4d61 100644 --- a/sys/kern/vfs_biomem.c +++ b/sys/kern/vfs_biomem.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vfs_biomem.c,v 1.4 2008/11/08 23:20:50 pedro Exp $ */ +/* $OpenBSD: vfs_biomem.c,v 1.5 2009/04/22 13:12:26 art Exp $ */ /* * Copyright (c) 2007 Artur Grabowski * @@ -75,7 +75,6 @@ buf_mem_init(vsize_t size) void buf_acquire(struct buf *bp) { - vaddr_t va; int s; KASSERT((bp->b_flags & B_BUSY) == 0); @@ -85,6 +84,32 @@ buf_acquire(struct buf *bp) * Busy before waiting for kvm. */ SET(bp->b_flags, B_BUSY); + buf_map(bp); + + splx(s); +} + +/* + * Busy a buffer, but don't map it. + * If it has a mapping, we keep it, but we also keep the mapping on + * the list since we assume that it won't be used anymore. + */ +void +buf_acquire_unmapped(struct buf *bp) +{ + int s; + + s = splbio(); + SET(bp->b_flags, B_BUSY|B_NOTMAPPED); + splx(s); +} + +void +buf_map(struct buf *bp) +{ + vaddr_t va; + + splassert(IPL_BIO); if (bp->b_data == NULL) { unsigned long i; @@ -123,22 +148,8 @@ buf_acquire(struct buf *bp) } else { TAILQ_REMOVE(&buf_valist, bp, b_valist); } - splx(s); -} -/* - * Busy a buffer, but don't map it. - * If it has a mapping, we keep it, but we also keep the mapping on - * the list since we assume that it won't be used anymore. - */ -void -buf_acquire_unmapped(struct buf *bp) -{ - int s; - - s = splbio(); - SET(bp->b_flags, B_BUSY|B_NOTMAPPED); - splx(s); + CLR(bp->b_flags, B_NOTMAPPED); } void @@ -209,6 +220,18 @@ buf_dealloc_mem(struct buf *bp) return (1); } +void +buf_shrink_mem(struct buf *bp, vsize_t newsize) +{ + vaddr_t va = (vaddr_t)bp->b_data; + + if (newsize < bp->b_bufsize) { + pmap_kremove(va + newsize, bp->b_bufsize - newsize); + pmap_update(pmap_kernel()); + bp->b_bufsize = newsize; + } +} + vaddr_t buf_unmap(struct buf *bp) { diff --git a/sys/sys/buf.h b/sys/sys/buf.h index 330cc90d614..c3898a15616 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -1,4 +1,4 @@ -/* $OpenBSD: buf.h,v 1.60 2008/06/12 06:58:40 deraadt Exp $ */ +/* $OpenBSD: buf.h,v 1.61 2009/04/22 13:12:26 art Exp $ */ /* $NetBSD: buf.h,v 1.25 1997/04/09 21:12:17 mycroft Exp $ */ /* @@ -247,8 +247,10 @@ struct buf *incore(struct vnode *, daddr64_t); void buf_mem_init(vsize_t); void buf_acquire(struct buf *); void buf_acquire_unmapped(struct buf *); +void buf_map(struct buf *); void buf_release(struct buf *); int buf_dealloc_mem(struct buf *); +void buf_shrink_mem(struct buf *, vsize_t); void buf_alloc_pages(struct buf *, vsize_t); void buf_free_pages(struct buf *); -- cgit v1.2.3