| author | Bob Beck <beck@cvs.openbsd.org> | 2008-06-10 20:14:38 +0000 |
|---|---|---|
| committer | Bob Beck <beck@cvs.openbsd.org> | 2008-06-10 20:14:38 +0000 |
| commit | b24b61b32a9fedc5de53427f16b7eb72e27e858d (patch) |
| tree | 7fb0535d861bd759c5fd3ec6d937d5aa85b4aeda |
| parent | cb3bb3cecc1d0474c3e63d1cdf36ba8c39fdf873 (diff) |
Buffer cache revamp
1) Remove multiple size queues, introduced as a stopgap.
2) Decouple pages containing data from their mappings.
3) Only keep buffers mapped when they actually have to be mapped
(right now, this is when buffers are B_BUSY).
4) New functions to make a buffer busy and to release the busy flag
(buf_acquire and buf_release).
5) Move high/low water marks and statistics counters into a structure.
6) Add a sysctl to retrieve buffer cache statistics.
Tested in several variants and beaten upon by bob and art for a year. Run
accidentally on henning's nfs server for a few months...
ok deraadt@, krw@, art@ - who promises to be around to deal with any fallout
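The statistics structure from item 5 is exported through the new sysctl from item 6 (the `VFS_BCACHESTAT` node visible in the `sys/sys/mount.h` hunk below, handled by `sysctl_rdstruct()` in `vfs_subr.c`). As a rough illustration only — this program is not part of the commit — a userland consumer could fetch the counters like this, assuming the usual `vfs.generic` MIB path:

```c
/*
 * Hypothetical sketch (not from this commit): read the buffer cache
 * statistics exported by the VFS_BCACHESTAT sysctl this change adds.
 */
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/mount.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
	/* CTL_VFS / VFS_GENERIC / VFS_BCACHESTAT, per the mount.h hunk. */
	int mib[3] = { CTL_VFS, VFS_GENERIC, VFS_BCACHESTAT };
	struct bcachestats bcstats;
	size_t len = sizeof(bcstats);

	if (sysctl(mib, 3, &bcstats, &len, NULL, 0) == -1)
		err(1, "sysctl");

	printf("%ld bufs (%ld free), %ld pages: %ld clean, %ld dirty\n",
	    bcstats.numbufs, bcstats.freebufs, bcstats.numbufpages,
	    bcstats.numcleanpages, bcstats.numdirtypages);
	printf("%ld reads started, %ld cache hits\n",
	    bcstats.numreads, bcstats.cachehits);
	return (0);
}
```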
-rw-r--r-- | sys/conf/files | 3 |
-rw-r--r-- | sys/isofs/udf/udf_subr.c | 6 |
-rw-r--r-- | sys/kern/kern_sched.c | 4 |
-rw-r--r-- | sys/kern/spec_vnops.c | 4 |
-rw-r--r-- | sys/kern/subr_disk.c | 4 |
-rw-r--r-- | sys/kern/vfs_bio.c | 418 |
-rw-r--r-- | sys/kern/vfs_biomem.c | 305 |
-rw-r--r-- | sys/kern/vfs_subr.c | 16 |
-rw-r--r-- | sys/miscfs/specfs/spec_vnops.c | 4 |
-rw-r--r-- | sys/nfs/nfs_syscalls.c | 5 |
-rw-r--r-- | sys/nfs/nfs_vnops.c | 17 |
-rw-r--r-- | sys/scsi/cd.c | 4 |
-rw-r--r-- | sys/sys/buf.h | 27 |
-rw-r--r-- | sys/sys/mount.h | 25 |
-rw-r--r-- | sys/ufs/ext2fs/ext2fs_bmap.c | 3 |
-rw-r--r-- | sys/ufs/ext2fs/ext2fs_inode.c | 4 |
-rw-r--r-- | sys/ufs/ffs/ffs_inode.c | 4 |
-rw-r--r-- | sys/ufs/ffs/ffs_softdep.c | 8 |
-rw-r--r-- | sys/ufs/ffs/ffs_vnops.c | 5 |
-rw-r--r-- | sys/ufs/ufs/ufs_bmap.c | 4 |
-rw-r--r-- | sys/ufs/ufs/ufs_dirhash.c | 3 |
21 files changed, 572 insertions, 301 deletions
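The mechanical part of the change, repeated across most files below, is the calling convention for busying a buffer: callers that used to set B_BUSY by hand now call buf_acquire(), which also guarantees the buffer is mapped in kvm, and the completion path releases it via buf_release() instead of clearing the flag. A condensed before/after sketch of that pattern — a hypothetical caller modeled on the spec_fsync()/vflushbuf() hunks below, not code from the tree:

```c
/*
 * Sketch of the calling convention change (hypothetical helper; the
 * OLD_WAY branch shows the pre-commit idiom for comparison).
 */
void
flush_one(struct buf *bp)
{
	int s;

	s = splbio();
	bremfree(bp);			/* take it off the free list */
#ifdef OLD_WAY
	bp->b_flags |= B_BUSY;		/* before: flag only, buffers were
					 * permanently mapped */
#else
	buf_acquire(bp);		/* after: sets B_BUSY and maps the
					 * buffer's pages into kvm if needed */
#endif
	splx(s);
	bawrite(bp);			/* async write; the completion path
					 * ends in brelse() -> buf_release() */
}
```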
diff --git a/sys/conf/files b/sys/conf/files
index 958ab8fd5fd..ce464781f75 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1,4 +1,4 @@
-#	$OpenBSD: files,v 1.432 2008/06/09 22:47:41 djm Exp $
+#	$OpenBSD: files,v 1.433 2008/06/10 20:14:36 beck Exp $
 #	$NetBSD: files,v 1.87 1996/05/19 17:17:50 jonathan Exp $
 #	@(#)files.newconf	7.5 (Berkeley) 5/10/93
 
@@ -707,6 +707,7 @@ file kern/uipc_socket2.c
 file kern/uipc_syscalls.c
 file kern/uipc_usrreq.c
 file kern/vfs_bio.c
+file kern/vfs_biomem.c
 file kern/vfs_cache.c
 file kern/vfs_cluster.c
 file kern/vfs_conf.c
diff --git a/sys/isofs/udf/udf_subr.c b/sys/isofs/udf/udf_subr.c
index 1c24b583be7..b81da604b1f 100644
--- a/sys/isofs/udf/udf_subr.c
+++ b/sys/isofs/udf/udf_subr.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: udf_subr.c,v 1.13 2007/06/08 05:35:32 deraadt Exp $	*/
+/*	$OpenBSD: udf_subr.c,v 1.14 2008/06/10 20:14:36 beck Exp $	*/
 
 /*
  * Copyright (c) 2006, Miodrag Vallat
@@ -110,7 +110,7 @@ udf_disklabelspoof(dev_t dev, void (*strat)(struct buf *),
 	 */
 	bp->b_blkno = sector * btodb(bsize);
 	bp->b_bcount = bsize;
-	bp->b_flags = B_BUSY | B_READ;
+	bp->b_flags |= B_READ;
 	bp->b_resid = bp->b_blkno / lp->d_secpercyl;
 
 	(*strat)(bp);
@@ -130,7 +130,7 @@ udf_disklabelspoof(dev_t dev, void (*strat)(struct buf *),
 	for (sector = mvds_start; sector < mvds_end; sector++) {
 		bp->b_blkno = sector * btodb(bsize);
 		bp->b_bcount = bsize;
-		bp->b_flags = B_BUSY | B_READ;
+		bp->b_flags |= B_READ;
 		bp->b_resid = bp->b_blkno / lp->d_secpercyl;
 
 		(*strat)(bp);
diff --git a/sys/kern/kern_sched.c b/sys/kern/kern_sched.c
index 3357da7c492..0759e01f2b8 100644
--- a/sys/kern/kern_sched.c
+++ b/sys/kern/kern_sched.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: kern_sched.c,v 1.3 2008/06/08 20:13:13 thib Exp $	*/
+/*	$OpenBSD: kern_sched.c,v 1.4 2008/06/10 20:14:36 beck Exp $	*/
 /*
  * Copyright (c) 2007 Artur Grabowski <art@openbsd.org>
  *
@@ -113,6 +113,8 @@ sched_idle(void *v)
 		}
 	}
 
+	splassert(IPL_NONE);
+
 	cpu_idle_enter();
 	while (sched_is_idle())
 		cpu_idle_cycle();
diff --git a/sys/kern/spec_vnops.c b/sys/kern/spec_vnops.c
index e175105abc1..3af4f6c1a10 100644
--- a/sys/kern/spec_vnops.c
+++ b/sys/kern/spec_vnops.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: spec_vnops.c,v 1.49 2008/05/08 17:45:45 thib Exp $	*/
+/*	$OpenBSD: spec_vnops.c,v 1.50 2008/06/10 20:14:36 beck Exp $	*/
 /*	$NetBSD: spec_vnops.c,v 1.29 1996/04/22 01:42:38 christos Exp $	*/
 
 /*
@@ -447,7 +447,7 @@ loop:
 		if ((bp->b_flags & B_DELWRI) == 0)
 			panic("spec_fsync: not dirty");
 		bremfree(bp);
-		bp->b_flags |= B_BUSY;
+		buf_acquire(bp);
 		splx(s);
 		bawrite(bp);
 		goto loop;
diff --git a/sys/kern/subr_disk.c b/sys/kern/subr_disk.c
index 92a267e9c1d..fe111450c96 100644
--- a/sys/kern/subr_disk.c
+++ b/sys/kern/subr_disk.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: subr_disk.c,v 1.74 2008/05/23 00:51:33 krw Exp $	*/
+/*	$OpenBSD: subr_disk.c,v 1.75 2008/06/10 20:14:36 beck Exp $	*/
 /*	$NetBSD: subr_disk.c,v 1.17 1996/03/16 23:17:08 christos Exp $	*/
 
 /*
@@ -399,7 +399,7 @@ readdoslabel(struct buf *bp, void (*strat)(struct buf *),
 	/* read boot record */
 	bp->b_blkno = part_blkno;
 	bp->b_bcount = lp->d_secsize;
-	bp->b_flags = B_BUSY | B_READ;
+	bp->b_flags = B_BUSY | B_READ | B_RAW;
 	(*strat)(bp);
 	if (biowait(bp)) {/*wrong*/
 		if (partoffp)
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 4d11266537c..f44253ef163 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: vfs_bio.c,v 1.103 2008/03/16 19:42:57 otto Exp $	*/
+/*	$OpenBSD: vfs_bio.c,v 1.104 2008/06/10 20:14:36 beck Exp $	*/
 /*	$NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $	*/
 
 /*-
@@ -78,14 +78,12 @@ u_long	bufhash;
 /*
  * Definitions for the buffer free lists.
  */
-#define	BQUEUES		6	/* number of free buffer queues */
+#define	BQUEUES		2	/* number of free buffer queues */
 
 #define	BQ_DIRTY	0	/* LRU queue with dirty buffers */
-
+#define	BQ_CLEAN	1	/* LRU queue with clean buffers */
 
 TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
-int bqpages[BQUEUES];	/* pages allocated, per queue */
-int bqpagelow;
 int needbuffer;
 struct bio_ops bioops;
 
@@ -93,7 +91,6 @@ struct bio_ops bioops;
  * Buffer pool for I/O buffers.
  */
 struct pool bufpool;
-struct vm_map *buf_map;
 struct bufhead bufhead = LIST_HEAD_INITIALIZER(bufhead);
 struct buf *buf_get(size_t);
 struct buf *buf_stub(struct vnode *, daddr64_t);
@@ -107,7 +104,7 @@ void buf_put(struct buf *);
 
 struct buf *bio_doread(struct vnode *, daddr64_t, int, int);
 struct buf *getnewbuf(size_t, int, int, int *);
-void buf_init(struct buf *, int);
+void buf_init(struct buf *);
 void bread_cluster_callback(struct buf *);
 
 /*
@@ -117,54 +114,33 @@ void bread_cluster_callback(struct buf *);
 *	numdirtypages - number of pages on BQ_DIRTY queue.
 *	lodirtypages - low water mark for buffer cleaning daemon.
 *	hidirtypages - high water mark for buffer cleaning daemon.
- *	numfreepages - number of pages on BQ_CLEAN and BQ_DIRTY queues. unused.
 *	numcleanpages - number of pages on BQ_CLEAN queue.
 *		Used to track the need to speedup the cleaner and
 *		as a reserve for special processes like syncer.
 *	maxcleanpages - the highest page count on BQ_CLEAN.
 */
-long numbufpages;
-long numdirtypages;
+
+struct bcachestats bcstats;
 long lodirtypages;
 long hidirtypages;
-long numfreepages;
-long numcleanpages;
 long locleanpages;
 long hicleanpages;
 long maxcleanpages;
 
-struct proc *cleanerproc;
-int bd_req;			/* Sleep point for cleaner daemon. */
+/* XXX - should be defined here. */
+extern int bufcachepercent;
 
-int size2cqueue(int *size);
+vsize_t bufkvm;
 
-int
-size2cqueue(int *size)
-{
-	int i = 0, q;
-	int s = *size;
-	s -= 1;
-	while (s > 0) {
-		s = s >> 1;
-		i++;
-	}
-	if (i < PAGE_SHIFT) {
-		i = PAGE_SHIFT;	/* < 4096 -> 4096 */
-	}
-	*size = 1 << i;
-	q = (i + 1 - PAGE_SHIFT); /* XXX 4096 is queue 1 */
-	if (q >= BQUEUES)
-		panic("queue %d > BQUEUES %d", q, BQUEUES);
-	if (q == 0)
-		panic("can't return dirty q");
-	return(q);
-}
+struct proc *cleanerproc;
+int bd_req;			/* Sleep point for cleaner daemon. */
 
 void
 bremfree(struct buf *bp)
 {
 	struct bqueues *dp = NULL;
-	int queue;
+
+	splassert(IPL_BIO);
 
 	/*
 	 * We only calculate the head of the freelist when removing
@@ -180,38 +156,26 @@ bremfree(struct buf *bp)
 		if (dp == &bufqueues[BQUEUES])
 			panic("bremfree: lost tail");
 	}
-	numfreepages -= atop(bp->b_bufsize);
 	if (!ISSET(bp->b_flags, B_DELWRI)) {
-		int qs = bp->b_bufsize;
-		queue = size2cqueue(&qs);
-		numcleanpages -= atop(bp->b_bufsize);
-		bqpages[queue] -= atop(bp->b_bufsize);
-	} else
-		numdirtypages -= atop(bp->b_bufsize);
+		bcstats.numcleanpages -= atop(bp->b_bufsize);
+	} else {
+		bcstats.numdirtypages -= atop(bp->b_bufsize);
+	}
 	TAILQ_REMOVE(dp, bp, b_freelist);
+	bcstats.freebufs--;
 }
 
 void
-buf_init(struct buf *bp, int size)
+buf_init(struct buf *bp)
 {
-	int npages, queue;
-
 	splassert(IPL_BIO);
 
-	npages = atop(size);
 	bzero((char *)bp, sizeof *bp);
 	bp->b_vnbufs.le_next = NOLIST;
 	bp->b_freelist.tqe_next = NOLIST;
 	bp->b_synctime = time_uptime + 300;
 	bp->b_dev = NODEV;
-	queue = size2cqueue(&size);
 	LIST_INIT(&bp->b_dep);
-	numbufpages += npages;
-	numfreepages += npages;
-	numcleanpages += npages;
-	bqpages[queue] += npages;
-	if (maxcleanpages < numcleanpages)
-		maxcleanpages = numcleanpages;
 }
 
 /*
@@ -238,7 +202,7 @@ buf_stub(struct vnode *vp, daddr64_t lblkno)
 	bp->b_dev = NODEV;
 	bp->b_bufsize = 0;
 	bp->b_data = NULL;
-	bp->b_flags = B_BUSY;
+	bp->b_flags = 0;
 	bp->b_dev = NODEV;
 	bp->b_blkno = bp->b_lblkno = lblkno;
 	bp->b_iodone = NULL;
@@ -250,8 +214,11 @@ buf_stub(struct vnode *vp, daddr64_t lblkno)
 
 	LIST_INIT(&bp->b_dep);
 
+	buf_acquire_unmapped(bp);
+
 	s = splbio();
 	LIST_INSERT_HEAD(&bufhead, bp, b_list);
+	bcstats.numbufs++;
 	bgetvp(vp, bp);
 	splx(s);
 
@@ -261,39 +228,31 @@ struct buf *
 buf_get(size_t size)
 {
-	struct bqueues *dp;
 	struct buf *bp;
 	int npages;
-	int queue, qs;
-	void *data;
 
 	splassert(IPL_BIO);
 
 	KASSERT(size > 0);
 
 	size = round_page(size);
-	qs = size;
-	queue = size2cqueue(&qs);
-	npages = atop(qs);
+	npages = atop(size);
 
-	if (numbufpages + npages > bufpages)
+	if (bcstats.numbufpages + npages > bufpages)
 		return (NULL);
 
 	bp = pool_get(&bufpool, PR_WAITOK);
-	data = (void *)uvm_km_alloc(buf_map, qs);
-	if (data == NULL) {
-		pool_put(&bufpool, bp);
-		return (NULL);
-	}
-	buf_init(bp, qs);
+	buf_init(bp);
 	bp->b_flags = B_INVAL;
-	bp->b_bufsize = qs;
-	bp->b_data = data;
-	dp = &bufqueues[queue];
-	binsheadfree(bp, dp);
+	buf_alloc_pages(bp, size);
+	bp->b_data = NULL;
+	binsheadfree(bp, &bufqueues[BQ_CLEAN]);
 	binshash(bp, &invalhash);
 	LIST_INSERT_HEAD(&bufhead, bp, b_list);
+	bcstats.numbufs++;
+	bcstats.freebufs++;
+	bcstats.numcleanpages += atop(bp->b_bufsize);
 
 	return (bp);
 }
@@ -303,7 +262,7 @@ buf_put(struct buf *bp)
 {
 	splassert(IPL_BIO);
 #ifdef DIAGNOSTIC
-	if (bp->b_data != NULL)
+	if (bp->b_pobj != NULL)
 		KASSERT(bp->b_bufsize > 0);
 #endif
 #ifdef DIAGNOSTIC
@@ -320,13 +279,10 @@ buf_put(struct buf *bp)
 		panic("buf_put: b_dep is not empty");
 #endif
 	LIST_REMOVE(bp, b_list);
+	bcstats.numbufs--;
 
-	if (bp->b_data != NULL) {
-		bremhash(bp);
-		numbufpages -= atop(bp->b_bufsize);
-		uvm_km_free(buf_map, (vaddr_t)bp->b_data, bp->b_bufsize);
-	}
-
+	if (buf_dealloc_mem(bp) != 0)
+		return;
	pool_put(&bufpool, bp);
 }
 
 /*
@@ -336,39 +292,56 @@ void
 bufinit(void)
 {
-	vaddr_t minaddr, maxaddr;
 	struct bqueues *dp;
 
+	/* XXX - for now */
+	bufpages = bufcachepercent = bufkvm = 0;
+
+	/*
+	 * If MD code doesn't say otherwise, use 10% of kvm for mappings and
+	 * 10% physmem for pages.
+	 */
+	if (bufcachepercent == 0)
+		bufcachepercent = 10;
+	if (bufpages == 0)
+		bufpages = physmem * bufcachepercent / 100;
+
+	if (bufkvm == 0)
+		bufkvm = (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 10;
+
+	/*
+	 * Don't use more than twice the amount of bufpages for mappings.
+	 * It's twice since we map things sparsely.
+	 */
+	if (bufkvm > bufpages * PAGE_SIZE)
+		bufkvm = bufpages * PAGE_SIZE;
+	/*
+	 * Round bufkvm to MAXPHYS because we allocate chunks of va space
+	 * in MAXPHYS chunks.
+	 */
+	bufkvm &= ~(MAXPHYS - 1);
+
 	pool_init(&bufpool, sizeof(struct buf), 0, 0, 0, "bufpl", NULL);
 	pool_setipl(&bufpool, IPL_BIO);
 	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
 		TAILQ_INIT(dp);
 
-	minaddr = vm_map_min(kernel_map);
-	buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
-	    ptoa(bufpages), 0, FALSE, NULL);
-
-	/*
-	 * XXX don't starve any one queue below 5% of the total number
-	 * of buffer cache pages.
-	 */
-	bqpagelow = bufpages / 20;
+	/*
+	 * hmm - bufkvm is an argument because it's static, while
+	 * bufpages is global because it can change while running.
+	 */
+	buf_mem_init(bufkvm);
 
 	bufhashtbl = hashinit(bufpages / 4, M_CACHE, M_WAITOK, &bufhash);
 	hidirtypages = (bufpages / 4) * 3;
 	lodirtypages = bufpages / 2;
 
 	/*
-	 * Reserve 5% of bufpages for syncer's needs,
-	 * but not more than 25% and if possible
-	 * not less than 2 * MAXBSIZE. locleanpages
-	 * value must be not too small
+	 * When we hit 95% of pages being clean, we bring them down to
+	 * 90% to have some slack.
 	 */
-	hicleanpages = bufpages / 2;
-	locleanpages = hicleanpages / 2;
-	if (locleanpages < atop(2 * MAXBSIZE))
-		locleanpages = atop(2 * MAXBSIZE);
-	if (locleanpages > bufpages / 4)
-		locleanpages = bufpages / 4;
+	hicleanpages = bufpages - (bufpages / 20);
+	locleanpages = bufpages - (bufpages / 10);
 
 	maxcleanpages = locleanpages;
 }
@@ -388,8 +361,9 @@ bio_doread(struct vnode *vp, daddr64_t blkno, int size, int async)
 	 */
 	if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
 		SET(bp->b_flags, B_READ | async);
+		bcstats.pendingreads++;
+		bcstats.numreads++;
 		VOP_STRATEGY(bp);
-
 		/* Pay for the read. */
 		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
 	} else if (async) {
@@ -477,7 +451,7 @@ bread_cluster_callback(struct buf *bp)
 	}
 
 	free(xbpp, M_TEMP);
-	bp->b_data = NULL;
+	bp->b_pobj = NULL;
 	buf_put(bp);
 }
 
@@ -485,9 +459,8 @@ int
 bread_cluster(struct vnode *vp, daddr64_t blkno, int size, struct buf **rbpp)
 {
 	struct buf *bp, **xbpp;
-	int howmany, i, maxra, inc;
+	int howmany, maxra, i, inc;
 	daddr64_t sblkno;
-	size_t spill;
 
 	*rbpp = bio_doread(vp, blkno, size, 0);
 
@@ -544,11 +517,16 @@ bread_cluster(struct vnode *vp, daddr64_t blkno, int size, struct buf **rbpp)
 
 	inc = btodb(size);
 	for (i = 0; i < howmany; i++) {
+		bcstats.pendingreads++;
+		bcstats.numreads++;
 		SET(xbpp[i]->b_flags, B_READ | B_ASYNC);
 		binshash(xbpp[i], BUFHASH(vp, xbpp[i]->b_lblkno));
 		xbpp[i]->b_blkno = sblkno + (i * inc);
 		xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size;
-		xbpp[i]->b_data = bp->b_data + (i * size);
+		xbpp[i]->b_data = NULL;
+		xbpp[i]->b_pobj = bp->b_pobj;
+		xbpp[i]->b_poffs = bp->b_poffs + (i * size);
+		buf_acquire_unmapped(xbpp[i]);
 	}
 
 	bp->b_blkno = sblkno;
@@ -557,12 +535,8 @@ bread_cluster(struct vnode *vp, daddr64_t blkno, int size, struct buf **rbpp)
 	bp->b_saveaddr = (void *)xbpp;
 	bp->b_iodone = bread_cluster_callback;
 	bp->b_vp = vp;
-	spill = bp->b_bufsize - bp->b_bcount;
-	if (spill) {
-		uvm_km_free(buf_map, (vaddr_t) bp->b_data + bp->b_bcount,
-		    spill);
-		numbufpages -= atop(spill);
-	}
+	bcstats.pendingreads++;
+	bcstats.numreads++;
 	VOP_STRATEGY(bp);
 	curproc->p_stats->p_ru.ru_inblock++;
 
@@ -609,6 +583,8 @@ bwrite(struct buf *bp)
 		else
 			mp->mnt_stat.f_syncwrites++;
 	}
+	bcstats.pendingwrites++;
+	bcstats.numwrites++;
 
 	wasdelayed = ISSET(bp->b_flags, B_DELWRI);
 	CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
@@ -723,6 +699,11 @@ buf_dirty(struct buf *bp)
 {
 	splassert(IPL_BIO);
 
+#ifdef DIAGNOSTIC
+	if (!ISSET(bp->b_flags, B_BUSY))
+		panic("Trying to dirty buffer on freelist!");
+#endif
+
 	if (ISSET(bp->b_flags, B_DELWRI) == 0) {
 		SET(bp->b_flags, B_DELWRI);
 		bp->b_synctime = time_uptime + 35;
@@ -738,6 +719,10 @@ buf_undirty(struct buf *bp)
 {
 	splassert(IPL_BIO);
 
+#ifdef DIAGNOSTIC
+	if (!ISSET(bp->b_flags, B_BUSY))
+		panic("Trying to undirty buffer on freelist!");
+#endif
 	if (ISSET(bp->b_flags, B_DELWRI)) {
 		CLR(bp->b_flags, B_DELWRI);
 		reassignbuf(bp);
@@ -769,8 +754,6 @@ brelse(struct buf *bp)
 		SET(bp->b_flags, B_INVAL);
 
 	if (ISSET(bp->b_flags, B_INVAL)) {
-		int queue, qs;
-
 		/*
 		 * If the buffer is invalid, place it in the clean queue, so it
 		 * can be reused.
@@ -789,37 +772,29 @@ brelse(struct buf *bp)
 		 * If the buffer has no associated data, place it back in the
 		 * pool.
 		 */
-		if (bp->b_data == NULL) {
+		if (bp->b_data == NULL && bp->b_pobj == NULL) {
 			buf_put(bp);
 			splx(s);
 			return;
 		}
 
-		qs = bp->b_bufsize;
-		queue = size2cqueue(&qs);
-		numcleanpages += atop(bp->b_bufsize);
-		bqpages[queue] += atop(bp->b_bufsize);
-		if (maxcleanpages < numcleanpages)
-			maxcleanpages = numcleanpages;
-		binsheadfree(bp, &bufqueues[queue]);
+		bcstats.numcleanpages += atop(bp->b_bufsize);
+		if (maxcleanpages < bcstats.numcleanpages)
+			maxcleanpages = bcstats.numcleanpages;
+		binsheadfree(bp, &bufqueues[BQ_CLEAN]);
 	} else {
 		/*
 		 * It has valid data. Put it on the end of the appropriate
 		 * queue, so that it'll stick around for as long as possible.
 		 */
-		int queue, qs;
-		numfreepages += atop(bp->b_bufsize);
-		qs = bp->b_bufsize;
-		queue = size2cqueue(&qs);
 		if (!ISSET(bp->b_flags, B_DELWRI)) {
-			numcleanpages += atop(bp->b_bufsize);
-			bqpages[queue] += atop(bp->b_bufsize);
-			if (maxcleanpages < numcleanpages)
-				maxcleanpages = numcleanpages;
-			bufq = &bufqueues[queue];
+			bcstats.numcleanpages += atop(bp->b_bufsize);
+			if (maxcleanpages < bcstats.numcleanpages)
+				maxcleanpages = bcstats.numcleanpages;
+			bufq = &bufqueues[BQ_CLEAN];
 		} else {
-			numdirtypages += atop(bp->b_bufsize);
+			bcstats.numdirtypages += atop(bp->b_bufsize);
 			bufq = &bufqueues[BQ_DIRTY];
 		}
 		if (ISSET(bp->b_flags, B_AGE)) {
@@ -832,7 +807,9 @@
 	}
 
 	/* Unlock the buffer. */
-	CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE | B_DEFERRED));
+	bcstats.freebufs++;
+	CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE | B_DEFERRED));
+	buf_release(bp);
 
 	/* Wake up any processes waiting for any buffer to become free. */
 	if (needbuffer) {
@@ -917,8 +894,10 @@ start:
 		}
 
 		if (!ISSET(bp->b_flags, B_INVAL)) {
-			SET(bp->b_flags, (B_BUSY | B_CACHE));
+			bcstats.cachehits++;
 			bremfree(bp);
+			SET(bp->b_flags, B_CACHE);
+			buf_acquire(bp);
 			splx(s);
 			break;
 		}
@@ -946,6 +925,10 @@ start:
 		bgetvp(vp, bp);
 		splx(s);
 	}
+#ifdef DIAGNOSTIC
+	if (!ISSET(bp->b_flags, B_BUSY))
+		panic("getblk buffer not B_BUSY");
+#endif
 
 	return (bp);
 }
@@ -972,7 +955,7 @@ struct buf *
 getnewbuf(size_t size, int slpflag, int slptimeo, int *ep)
 {
 	struct buf *bp;
-	int s, error, queue, qs;
+	int s;
 
 #if 0 /* we would really like this but sblock update kills it */
 	KASSERT(curproc != syncerproc && curproc != cleanerproc);
@@ -982,72 +965,47 @@ getnewbuf(size_t size, int slpflag, int slptimeo, int *ep)
 	/*
 	 * Wake up cleaner if we're getting low on pages.
 	 */
-	if (numdirtypages >= hidirtypages || numcleanpages <= locleanpages)
+	if (bcstats.numdirtypages >= hidirtypages || bcstats.numcleanpages <= locleanpages)
 		wakeup(&bd_req);
 
+	/*
+	 * If we're above the high water mark for clean pages,
+	 * free down to the low water mark.
+	 */
+	if (bcstats.numcleanpages > hicleanpages) {
+		while (bcstats.numcleanpages > locleanpages) {
+			bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]);
+			bremfree(bp);
+			if (bp->b_vp)
+				brelvp(bp);
+			bremhash(bp);
+			buf_put(bp);
+		}
+	}
+
 	/* we just ask. it can say no.. */
getsome:
-	qs = size;
-	queue = size2cqueue(&qs);
-	bp = buf_get(qs); /* XXX use qs instead and no need in buf_get? */
+	bp = buf_get(size);
 	if (bp == NULL) {
-		/*
-		 * No free ones, try to reuse a clean one of the same or
-		 * larger size.
-		 */
-		do {
-			bp = TAILQ_FIRST(&bufqueues[queue]);
-			queue++;
-		} while (bp == NULL && queue < BQUEUES);
-	}
-	if (bp == NULL) {
-		/* we couldn't reuse a free one, nothing of the right size */
-		/* XXX free 20 buffers per q - ugly hack  should really
-		 * reuse big ones without truncating. fix later
-		 */
-		int q, gotsome = 0;
-		int freemax = 20;
-		for (q = 1; q < BQUEUES; q++) {
-			int i = freemax;
-			while (bqpages[q] > bqpagelow
-			    && (bp = TAILQ_FIRST(&bufqueues[q]))
-			    && i--) {
-				gotsome++;
-				bremfree(bp);
-				if (LIST_FIRST(&bp->b_dep) != NULL)
-					buf_deallocate(bp);
-
-				if (ISSET(bp->b_flags, B_DELWRI)) {
-					CLR(bp->b_flags, B_DELWRI);
-				}
-
-				if (bp->b_vp)
-					brelvp(bp);
-
-				buf_put(bp);
-			}
+		int freemax = 5;
+		int i = freemax;
+		while ((bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN])) && i--) {
+			bremfree(bp);
+			if (bp->b_vp)
+				brelvp(bp);
+			bremhash(bp);
+			buf_put(bp);
 		}
-		if (gotsome)
+		if (freemax != i)
 			goto getsome;
-	}
-	if (bp == NULL) {
-		/* wait for a free buffer of any kind */
-		needbuffer++;
-		error = tsleep(&needbuffer, slpflag | (PRIBIO + 1),
-		    "getnewbuf", slptimeo);
-		if (ep != NULL) {
-			*ep = error;
-			if (error) {
-				splx(s);
-				return (NULL);
-			}
-		}
-		goto getsome;
+		splx(s);
+		return (NULL);
 	}
 
 	bremfree(bp);
 
 	/* Buffer is no longer on free lists. */
-	SET(bp->b_flags, B_BUSY);
+	bp->b_flags = 0;
+	buf_acquire(bp);
 
 #ifdef DIAGNOSTIC
 	if (ISSET(bp->b_flags, B_DELWRI))
@@ -1067,7 +1025,6 @@ getsome:
 #endif
 
 	/* clear out various other fields */
-	bp->b_flags = B_BUSY;
 	bp->b_dev = NODEV;
 	bp->b_blkno = bp->b_lblkno = 0;
 	bp->b_iodone = NULL;
@@ -1095,8 +1052,7 @@ buf_daemon(struct proc *p)
 
 	s = splbio();
 	for (;;) {
-		if (!numdirtypages ||
-		    (numdirtypages < hidirtypages && !needbuffer))
+		if (bcstats.numdirtypages < hidirtypages)
 			tsleep(&bd_req, PRIBIO - 7, "cleaner", 0);
 
 		getmicrouptime(&starttime);
@@ -1104,11 +1060,11 @@ buf_daemon(struct proc *p)
 		while ((bp = TAILQ_FIRST(&bufqueues[BQ_DIRTY]))) {
 			struct timeval tv;
 
-			if (numdirtypages < lodirtypages && !needbuffer)
+			if (bcstats.numdirtypages < lodirtypages)
 				break;
 
 			bremfree(bp);
-			SET(bp->b_flags, B_BUSY);
+			buf_acquire(bp);
 			splx(s);
 
 			if (ISSET(bp->b_flags, B_INVAL)) {
@@ -1125,10 +1081,10 @@ buf_daemon(struct proc *p)
 			    buf_countdeps(bp, 0, 0)) {
 				SET(bp->b_flags, B_DEFERRED);
 				s = splbio();
-				numfreepages += atop(bp->b_bufsize);
-				numdirtypages += atop(bp->b_bufsize);
+				bcstats.numdirtypages += atop(bp->b_bufsize);
 				binstailfree(bp, &bufqueues[BQ_DIRTY]);
-				CLR(bp->b_flags, B_BUSY);
+				bcstats.freebufs++;
+				buf_release(bp);
 				continue;
 			}
 
@@ -1154,6 +1110,8 @@ biowait(struct buf *bp)
 {
 	int s;
 
+	KASSERT(!(bp->b_flags & B_ASYNC));
+
 	s = splbio();
 	while (!ISSET(bp->b_flags, B_DONE))
 		tsleep(bp, PRIBIO + 1, "biowait", 0);
@@ -1203,8 +1161,11 @@ biodone(struct buf *bp)
 
 	if (!ISSET(bp->b_flags, B_READ)) {
 		CLR(bp->b_flags, B_WRITEINPROG);
+		bcstats.pendingwrites--;
 		vwakeup(bp->b_vp);
-	}
+	} else if (bcstats.numbufs &&
+	    (!(ISSET(bp->b_flags, B_RAW) || ISSET(bp->b_flags, B_PHYS))))
+		bcstats.pendingreads--;
 
 	if (ISSET(bp->b_flags, B_CALL)) {	/* if necessary, call out */
 		CLR(bp->b_flags, B_CALL);	/* but note callout done */
@@ -1218,66 +1179,3 @@ biodone(struct buf *bp)
 		}
 	}
 }
-
-#if 1
-void
-vfs_bufstats(void) {
-	return;
-}
-/* #ifdef DDB */
-#else
-/*
- * Print out statistics on the current allocation of the buffer pool.
- * Can be enabled to print out on every ``sync'' by setting "syncprt"
- * in vfs_syscalls.c using sysctl.
- */
-void
-vfs_bufstats(void)
-{
-	int s, i, j, count;
-	struct buf *bp;
-	struct bqueues *dp;
-	int counts[MAXBSIZE/PAGE_SIZE+1];
-	int totals[BQUEUES];
-	long ptotals[BQUEUES];
-	long pages;
-	static char *bname[BQUEUES] = { "CLEAN", "DIRTY", "EMPTY" };
-
-	s = splbio();
-	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
-		count = 0;
-		pages = 0;
-		for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
-			counts[j] = 0;
-		TAILQ_FOREACH(bp, dp, b_freelist) {
-			counts[bp->b_bufsize/PAGE_SIZE]++;
-			count++;
-			pages += atop(bp->b_bufsize);
-		}
-		totals[i] = count;
-		ptotals[i] = pages;
-		printf("%s: total-%d(%d pages)", bname[i], count, pages);
-		for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
-			if (counts[j] != 0)
-				printf(", %d-%d", j * PAGE_SIZE, counts[j]);
-		printf("\n");
-	}
-	if ((ptotals[BQ_CLEAN] + ptotals[BQ_DIRTY]) != numfreepages)
-		printf("numfreepages counter wrong: %ld != %ld\n",
-		    numfreepages, ptotals[BQ_CLEAN] + ptotals[BQ_DIRTY]);
-	if (ptotals[BQ_CLEAN] != numcleanpages)
-		printf("numcleanpages counter wrong: %ld != %ld\n",
-		    numcleanpages, ptotals[BQ_CLEAN]);
-	else
-		printf("numcleanpages: %ld\n", numcleanpages);
-	if (numdirtypages != ptotals[BQ_DIRTY])
-		printf("numdirtypages counter wrong: %ld != %ld\n",
-		    numdirtypages, ptotals[BQ_DIRTY]);
-	else
-		printf("numdirtypages: %ld\n", numdirtypages);
-
-	printf("syncer eating up to %ld pages from %ld reserved\n",
-	    maxcleanpages - hicleanpages, locleanpages);
-	splx(s);
-}
-#endif /* DEBUG */
diff --git a/sys/kern/vfs_biomem.c b/sys/kern/vfs_biomem.c
new file mode 100644
index 00000000000..ccda55290fb
--- /dev/null
+++ b/sys/kern/vfs_biomem.c
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2007 Artur Grabowski <art@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/pool.h>
+#include <sys/mount.h>
+
+#include <uvm/uvm_extern.h>
+#include <uvm/uvm.h>
+
+vaddr_t buf_kva_start, buf_kva_end;
+int buf_needva;
+TAILQ_HEAD(,buf) buf_valist;
+
+int buf_nkvmsleep;
+
+extern struct bcachestats bcstats;
+
+/*
+ * Pages are allocated from a uvm object (we only use it for page storage,
+ * all pages are wired). Since every buffer contains a contiguous range of
+ * pages, reusing the pages could be very painful. Fortunately voff_t is
+ * 64 bits, so we can just increment buf_page_offset all the time and ignore
+ * wraparound. Even if you reuse 4GB worth of buffers every second
+ * you'll still run out of time_t faster than buffers.
+ *
+ * XXX - the spl locking in here is extreme paranoia right now until I figure
+ * it all out.
+ */
+voff_t buf_page_offset;
+struct uvm_object *buf_object, buf_object_store;
+
+vaddr_t buf_unmap(struct buf *);
+
+void
+buf_mem_init(vsize_t size)
+{
+	TAILQ_INIT(&buf_valist);
+
+	buf_kva_start = vm_map_min(kernel_map);
+	if (uvm_map(kernel_map, &buf_kva_start, size, NULL,
+	    UVM_UNKNOWN_OFFSET, PAGE_SIZE, UVM_MAPFLAG(UVM_PROT_NONE,
+	    UVM_PROT_NONE, UVM_INH_NONE, UVM_ADV_NORMAL, 0)))
+		panic("bufinit: can't reserve VM for buffers");
+	buf_kva_end = buf_kva_start + size;
+
+	buf_object = &buf_object_store;
+
+	buf_object->pgops = NULL;
+	TAILQ_INIT(&buf_object->memq);
+	buf_object->uo_npages = 0;
+	buf_object->uo_refs = 1;
+}
+
+/*
+ * buf_acquire and buf_release manage the kvm mappings of buffers.
+ */
+void
+buf_acquire(struct buf *bp)
+{
+	vaddr_t va;
+	int s;
+
+	KASSERT((bp->b_flags & B_BUSY) == 0);
+
+	s = splbio();
+	/*
+	 * Busy before waiting for kvm.
+	 */
+	SET(bp->b_flags, B_BUSY);
+
+	if (bp->b_data == NULL) {
+		unsigned long i;
+
+		/*
+		 * First, just use the pre-allocated space until we run out.
+		 */
+		if (buf_kva_start < buf_kva_end) {
+			va = buf_kva_start;
+			buf_kva_start += MAXPHYS;
+		} else {
+			struct buf *vbp;
+
+			/*
+			 * Find some buffer we can steal the space from.
+			 */
+			while ((vbp = TAILQ_FIRST(&buf_valist)) == NULL) {
+				buf_needva++;
+				buf_nkvmsleep++;
+				tsleep(&buf_needva, PRIBIO, "buf_needva", 0);
+			}
+			va = buf_unmap(vbp);
+		}
+
+		for (i = 0; i < atop(bp->b_bufsize); i++) {
+			struct vm_page *pg = uvm_pagelookup(bp->b_pobj,
+			    bp->b_poffs + ptoa(i));
+
+			KASSERT(pg != NULL);
+
+			pmap_kenter_pa(va + ptoa(i), VM_PAGE_TO_PHYS(pg),
+			    VM_PROT_READ|VM_PROT_WRITE);
+			pmap_update(pmap_kernel());
+		}
+		bp->b_data = (caddr_t)va;
+	} else {
+		TAILQ_REMOVE(&buf_valist, bp, b_valist);
+	}
+	splx(s);
+}
+
+/*
+ * Busy a buffer, but don't map it.
+ * If it has a mapping, we keep it, but we also keep the mapping on
+ * the list since we assume that it won't be used anymore.
+ */
+void
+buf_acquire_unmapped(struct buf *bp)
+{
+	int s;
+
+	s = splbio();
+	SET(bp->b_flags, B_BUSY|B_NOTMAPPED);
+	splx(s);
+}
+
+void
+buf_release(struct buf *bp)
+{
+	int s;
+
+	KASSERT(bp->b_flags & B_BUSY);
+	KASSERT((bp->b_data != NULL) || (bp->b_flags & B_NOTMAPPED));
+
+	s = splbio();
+	if (bp->b_data) {
+		TAILQ_INSERT_TAIL(&buf_valist, bp, b_valist);
+		if (buf_needva) {
+			buf_needva--;
+			wakeup_one(&buf_needva);
+		}
+	}
+	CLR(bp->b_flags, B_BUSY|B_NOTMAPPED);
+	splx(s);
+}
+
+/*
+ * Deallocate all memory resources for this buffer. We need to be careful
+ * to not drop kvm since we have no way to reclaim it. So, if the buffer
+ * has kvm, we need to free it later. We put it on the front of the
+ * freelist just so it gets picked up faster.
+ *
+ * Also, lots of assertions count on bp->b_data being NULL, so we
+ * set it temporarily to NULL.
+ *
+ * Return non-zero if we take care of the freeing later.
+ */
+int
+buf_dealloc_mem(struct buf *bp)
+{
+	caddr_t data = bp->b_data;
+	int s;
+
+	s = splbio();
+
+	bp->b_data = NULL;
+
+	if (bp->b_pobj) {
+		if (data) {
+			pmap_kremove((vaddr_t)data, bp->b_bufsize);
+			pmap_update(pmap_kernel());
+		}
+		buf_free_pages(bp);
+	}
+
+	if (data == NULL) {
+		splx(s);
+		return (0);
+	}
+
+	bp->b_data = data;
+	if (!(bp->b_flags & B_BUSY))		/* XXX - need better test */
+		TAILQ_REMOVE(&buf_valist, bp, b_valist);
+	else
+		CLR(bp->b_flags, B_BUSY);
+	SET(bp->b_flags, B_RELEASED);
+	TAILQ_INSERT_HEAD(&buf_valist, bp, b_valist);
+
+	splx(s);
+
+	return (1);
+}
+
+vaddr_t
+buf_unmap(struct buf *bp)
+{
+	vaddr_t va;
+	int s;
+
+	KASSERT((bp->b_flags & B_BUSY) == 0);
+	KASSERT(bp->b_data != NULL);
+
+	s = splbio();
+	TAILQ_REMOVE(&buf_valist, bp, b_valist);
+	va = (vaddr_t)bp->b_data;
+	bp->b_data = 0;
+	pmap_kremove(va, bp->b_bufsize);
+	pmap_update(pmap_kernel());
+
+	if (bp->b_flags & B_RELEASED)
+		pool_put(&bufpool, bp);
+
+	splx(s);
+
+	return (va);
+}
+
+void
+buf_alloc_pages(struct buf *bp, vsize_t size)
+{
+	struct vm_page *pg;
+	voff_t offs, i;
+	int s;
+
+	KASSERT(size == round_page(size));
+	KASSERT(bp->b_pobj == NULL);
+	KASSERT(bp->b_data == NULL);
+
+	s = splbio();
+
+	offs = buf_page_offset;
+	buf_page_offset += size;
+
+	KASSERT(buf_page_offset > 0);
+
+	for (i = 0; i < atop(size); i++) {
+#if defined(DEBUG) || 1
+		if ((pg = uvm_pagelookup(buf_object, offs + ptoa(i))))
+			panic("buf_alloc_pages: overlap buf: %p page: %p",
+			    bp, pg);
+#endif
+
+		while ((pg = uvm_pagealloc(buf_object, offs + ptoa(i),
+		    NULL, 0)) == NULL) {
+			uvm_wait("buf_alloc_pages");
+		}
+		pg->wire_count = 1;
+		atomic_clearbits_int(&pg->pg_flags, PG_BUSY);
+		bcstats.numbufpages++;
+	}
+
+	bp->b_pobj = buf_object;
+	bp->b_poffs = offs;
+	bp->b_bufsize = size;
+	splx(s);
+}
+
+void
+buf_free_pages(struct buf *bp)
+{
+	struct uvm_object *uobj = bp->b_pobj;
+	struct vm_page *pg;
+	voff_t off, i;
+	int s;
+
+	KASSERT(bp->b_data == NULL);
+	KASSERT(uobj != NULL);
+
+	s = splbio();
+
+	off = bp->b_poffs;
+	bp->b_pobj = NULL;
+	bp->b_poffs = 0;
+
+	for (i = 0; i < (bp->b_bufsize >> PAGE_SHIFT); i++) {
+		pg = uvm_pagelookup(uobj, off + (i * PAGE_SIZE));
+		KASSERT(pg != NULL);
+		KASSERT(pg->wire_count == 1);
+		pg->wire_count = 0;
+		uvm_pagefree(pg);
+		bcstats.numbufpages--;
+	}
+	splx(s);
+}
+
+/*
+ * XXX - it might make sense to make a buf_realloc_pages to avoid
+ * bouncing through the free list all the time.
+ */
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 3ac320e5bc1..65153b0cdfc 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: vfs_subr.c,v 1.167 2008/06/09 23:38:37 millert Exp $	*/
+/*	$OpenBSD: vfs_subr.c,v 1.168 2008/06/10 20:14:36 beck Exp $	*/
 /*	$NetBSD: vfs_subr.c,v 1.53 1996/04/22 01:39:13 christos Exp $	*/
 
 /*
@@ -120,7 +120,7 @@ void
 vntblinit(void)
 {
 	/* buffer cache may need a vnode for each buffer */
-	maxvnodes = desiredvnodes;
+	maxvnodes = bufpages;
 	pool_init(&vnode_pool, sizeof(struct vnode), 0, 0, 0, "vnodes",
 	    &pool_allocator_nointr);
 	TAILQ_INIT(&vnode_hold_list);
@@ -1256,8 +1256,12 @@ vfs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
 
 		free(tmpvfsp, M_TEMP);
 		return (ret);
-	}
 
+	case VFS_BCACHESTAT:	/* buffer cache statistics */
+		ret = sysctl_rdstruct(oldp, oldlenp, newp, &bcstats,
+		    sizeof(struct bcachestats));
+		return(ret);
+	}
 	return (EOPNOTSUPP);
 }
 
@@ -1664,7 +1668,7 @@ vfs_syncwait(int verbose)
 		if (bp->b_flags & B_DELWRI) {
 			s = splbio();
 			bremfree(bp);
-			bp->b_flags |= B_BUSY;
+			buf_acquire(bp);
 			splx(s);
 			nbusy++;
 			bawrite(bp);
@@ -1835,7 +1839,7 @@ loop:
 			break;
 		}
 		bremfree(bp);
-		bp->b_flags |= B_BUSY;
+		buf_acquire(bp);
 		/*
 		 * XXX Since there are no node locks for NFS, I believe
 		 * there is a slight chance that a delayed write will
@@ -1873,7 +1877,7 @@ loop:
 		if ((bp->b_flags & B_DELWRI) == 0)
 			panic("vflushbuf: not dirty");
 		bremfree(bp);
-		bp->b_flags |= B_BUSY;
+		buf_acquire(bp);
 		splx(s);
 		/*
 		 * Wait for I/O associated with indirect blocks to complete,
diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c
index e175105abc1..3af4f6c1a10 100644
--- a/sys/miscfs/specfs/spec_vnops.c
+++ b/sys/miscfs/specfs/spec_vnops.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: spec_vnops.c,v 1.49 2008/05/08 17:45:45 thib Exp $	*/
+/*	$OpenBSD: spec_vnops.c,v 1.50 2008/06/10 20:14:36 beck Exp $	*/
 /*	$NetBSD: spec_vnops.c,v 1.29 1996/04/22 01:42:38 christos Exp $	*/
 
 /*
@@ -447,7 +447,7 @@ loop:
 		if ((bp->b_flags & B_DELWRI) == 0)
 			panic("spec_fsync: not dirty");
 		bremfree(bp);
-		bp->b_flags |= B_BUSY;
+		buf_acquire(bp);
 		splx(s);
 		bawrite(bp);
 		goto loop;
diff --git a/sys/nfs/nfs_syscalls.c b/sys/nfs/nfs_syscalls.c
index 5b29ccd0935..007933590d7 100644
--- a/sys/nfs/nfs_syscalls.c
+++ b/sys/nfs/nfs_syscalls.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: nfs_syscalls.c,v 1.59 2008/05/02 13:26:27 thib Exp $	*/
+/*	$OpenBSD: nfs_syscalls.c,v 1.60 2008/06/10 20:14:37 beck Exp $	*/
 /*	$NetBSD: nfs_syscalls.c,v 1.19 1996/02/18 11:53:52 fvdl Exp $	*/
 
 /*
@@ -748,7 +748,8 @@ nfssvc_iod(p)
 			    (B_BUSY|B_DELWRI|B_NEEDCOMMIT|B_NOCACHE))!=B_DELWRI)
 				continue;
 			bremfree(nbp);
-			nbp->b_flags |= (B_BUSY|B_ASYNC);
+			nbp->b_flags |= B_ASYNC;
+			buf_acquire(nbp);
 			break;
 		}
 		/*
diff --git a/sys/nfs/nfs_vnops.c b/sys/nfs/nfs_vnops.c
index be30de9f5f4..d26824f8b0e 100644
--- a/sys/nfs/nfs_vnops.c
+++ b/sys/nfs/nfs_vnops.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: nfs_vnops.c,v 1.84 2008/06/09 23:38:37 millert Exp $	*/
+/*	$OpenBSD: nfs_vnops.c,v 1.85 2008/06/10 20:14:37 beck Exp $	*/
 /*	$NetBSD: nfs_vnops.c,v 1.62.4.1 1996/07/08 20:26:52 jtc Exp $	*/
 
 /*
@@ -2675,7 +2675,8 @@ again:
 		    != (B_DELWRI | B_NEEDCOMMIT))
 			continue;
 		bremfree(bp);
-		bp->b_flags |= (B_BUSY | B_WRITEINPROG);
+		bp->b_flags |= B_WRITEINPROG;
+		buf_acquire(bp);
 		/*
 		 * A list of these buffers is kept so that the
 		 * second loop knows which buffers have actually
@@ -2753,10 +2754,12 @@ loop:
 		if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT))
 			continue;
 		bremfree(bp);
-		if (passone || !commit)
-			bp->b_flags |= (B_BUSY|B_ASYNC);
-		else
-			bp->b_flags |= (B_BUSY|B_ASYNC|B_WRITEINPROG|B_NEEDCOMMIT);
+		if (passone || !commit) {
+			bp->b_flags |= B_ASYNC;
+		} else {
+			bp->b_flags |= (B_ASYNC|B_WRITEINPROG|B_NEEDCOMMIT);
+		}
+		buf_acquire(bp);
 		splx(s);
 		VOP_BWRITE(bp);
 		goto loop;
@@ -2952,6 +2955,8 @@ nfs_writebp(bp, force)
 	if (retv) {
 		if (force)
 			bp->b_flags |= B_WRITEINPROG;
+		bcstats.pendingwrites++;
+		bcstats.numwrites++;
 		VOP_STRATEGY(bp);
 	}
 
diff --git a/sys/scsi/cd.c b/sys/scsi/cd.c
index 52cde028c4c..bbc1bbb25ba 100644
--- a/sys/scsi/cd.c
+++ b/sys/scsi/cd.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: cd.c,v 1.135 2008/05/27 11:39:22 fgsch Exp $	*/
+/*	$OpenBSD: cd.c,v 1.136 2008/06/10 20:14:37 beck Exp $	*/
 /*	$NetBSD: cd.c,v 1.100 1997/04/02 02:29:30 mycroft Exp $	*/
 
 /*
@@ -82,6 +82,8 @@
 #define	CD_FRAMES	75
 #define	CD_SECS		60
 
+#define CD_LOCKED 0x0800
+
 struct cd_toc {
 	struct ioc_toc_header header;
 	struct cd_toc_entry entries[MAXTRACK+1]; /* One extra for the */
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index c301148a9b2..50edefe61f4 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -1,4 +1,4 @@
-/*	$OpenBSD: buf.h,v 1.57 2007/05/28 18:08:47 pedro Exp $	*/
+/*	$OpenBSD: buf.h,v 1.58 2008/06/10 20:14:37 beck Exp $	*/
 /*	$NetBSD: buf.h,v 1.25 1997/04/09 21:12:17 mycroft Exp $	*/
 
 /*
@@ -87,6 +87,12 @@ struct buf {
 	dev_t	b_dev;			/* Device associated with buffer. */
 	caddr_t	b_data;			/* associated data */
 	void	*b_saveaddr;		/* Original b_data for physio. */
+
+	TAILQ_ENTRY(buf) b_valist;	/* LRU of va to reuse. */
+
+	struct uvm_object *b_pobj;	/* Object containing the pages */
+	off_t	b_poffs;		/* Offset within object */
+
 	daddr64_t	b_lblkno;	/* Logical block number. */
 	daddr64_t	b_blkno;	/* Underlying physical block number. */
 					/* Function to call upon completion. */
@@ -161,6 +167,8 @@ struct buf *bufq_default_get(struct bufq *);
 #define	B_DEFERRED	0x04000000	/* Skipped over for cleaning */
 #define	B_SCANNED	0x08000000	/* Block already pushed during sync */
 #define	B_PDAEMON	0x10000000	/* I/O started by pagedaemon */
+#define	B_RELEASED	0x20000000	/* free this buffer after its kvm */
+#define	B_NOTMAPPED	0x40000000	/* BUSY, but not necessarily mapped */
 
 #define	B_BITS	"\010\001AGE\002NEEDCOMMIT\003ASYNC\004BAD\005BUSY\006CACHE" \
     "\007CALL\010DELWRI\012DONE\013EINTR\014ERROR" \
@@ -228,6 +236,23 @@ struct buf *getblk(struct vnode *, daddr64_t, int, int, int);
 struct buf *geteblk(int);
 struct buf *incore(struct vnode *, daddr64_t);
 
+/*
+ * buf_kvm_init initializes the kvm handling for buffers.
+ * buf_acquire sets the B_BUSY flag and ensures that the buffer is
+ * mapped in the kvm.
+ * buf_release clears the B_BUSY flag and allows the buffer to become
+ * unmapped.
+ * buf_unmap is for internal use only. Unmaps the buffer from kvm.
+ */
+void	buf_mem_init(vsize_t);
+void	buf_acquire(struct buf *);
+void	buf_acquire_unmapped(struct buf *);
+void	buf_release(struct buf *);
+int	buf_dealloc_mem(struct buf *);
+void	buf_alloc_pages(struct buf *, vsize_t);
+void	buf_free_pages(struct buf *);
+
+
 void	minphys(struct buf *bp);
 int	physio(void (*strategy)(struct buf *), struct buf *bp, dev_t dev,
 	    int flags, void (*minphys)(struct buf *), struct uio *uio);
diff --git a/sys/sys/mount.h b/sys/sys/mount.h
index 09a6d67ee2f..64e01badfef 100644
--- a/sys/sys/mount.h
+++ b/sys/sys/mount.h
@@ -1,4 +1,4 @@
-/*	$OpenBSD: mount.h,v 1.84 2008/05/07 14:09:36 thib Exp $	*/
+/*	$OpenBSD: mount.h,v 1.85 2008/06/10 20:14:37 beck Exp $	*/
 /*	$NetBSD: mount.h,v 1.48 1996/02/18 11:55:47 fvdl Exp $	*/
 
 /*
@@ -465,10 +465,13 @@ struct mount {
 #define VFS_MAXTYPENUM	1	/* int: highest defined filesystem type */
 #define VFS_CONF	2	/* struct: vfsconf for filesystem given
				   as next argument */
+#define VFS_BCACHESTAT	3	/* struct: buffer cache statistics given
+				   as next argument */
 
 #define CTL_VFSGENCTL_NAMES { \
 	{ 0, 0 }, \
 	{ "maxtypenum", CTLTYPE_INT }, \
-	{ "conf", CTLTYPE_NODE } \
+	{ "conf", CTLTYPE_NODE }, \
+	{ "bcachestat", CTLTYPE_STRUCT } \
 }
 
 /*
@@ -485,6 +488,24 @@ struct vfsconf {
 	struct	vfsconf *vfc_next;	/* next in list */
 };
 
+/* buffer cache statistics */
+struct bcachestats {
+	long numbufs;		/* number of buffers allocated */
+	long freebufs;		/* number of free buffers */
+	long numbufpages;	/* number of pages in buffer cache */
+	long numfreepages;	/* number of free pages */
+	long numdirtypages;	/* number of dirty free pages */
+	long numcleanpages;	/* number of clean free pages */
+	long pendingwrites;	/* number of pending writes */
+	long pendingreads;	/* number of pending reads */
+	long numwrites;		/* total writes started */
+	long numreads;		/* total reads started */
+	long cachehits;		/* total reads found in cache */
+};
+#ifdef _KERNEL
+extern struct bcachestats bcstats;
+#endif
+
 /*
 * Operations supported on mounted file system.
 */
diff --git a/sys/ufs/ext2fs/ext2fs_bmap.c b/sys/ufs/ext2fs/ext2fs_bmap.c
index dc7acb7d213..5de33f44a8f 100644
--- a/sys/ufs/ext2fs/ext2fs_bmap.c
+++ b/sys/ufs/ext2fs/ext2fs_bmap.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: ext2fs_bmap.c,v 1.15 2007/06/17 20:15:25 jasper Exp $	*/
+/*	$OpenBSD: ext2fs_bmap.c,v 1.16 2008/06/10 20:14:37 beck Exp $	*/
 /*	$NetBSD: ext2fs_bmap.c,v 1.5 2000/03/30 12:41:11 augustss Exp $	*/
 
 /*
@@ -190,6 +190,7 @@ ext2fs_bmaparray(struct vnode *vp, int32_t bn, daddr64_t *bnp,
 			bp->b_flags |= B_READ;
 			VOP_STRATEGY(bp);
 			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
+			bcstats.pendingreads++;
 			if ((error = biowait(bp)) != 0) {
 				brelse(bp);
 				return (error);
diff --git a/sys/ufs/ext2fs/ext2fs_inode.c b/sys/ufs/ext2fs/ext2fs_inode.c
index a78632ce518..a974b9f0874 100644
--- a/sys/ufs/ext2fs/ext2fs_inode.c
+++ b/sys/ufs/ext2fs/ext2fs_inode.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: ext2fs_inode.c,v 1.39 2007/10/29 17:06:20 chl Exp $	*/
+/*	$OpenBSD: ext2fs_inode.c,v 1.40 2008/06/10 20:14:37 beck Exp $	*/
 /*	$NetBSD: ext2fs_inode.c,v 1.24 2001/06/19 12:59:18 wiz Exp $	*/
 
 /*
@@ -466,6 +466,8 @@ ext2fs_indirtrunc(struct inode *ip, int32_t lbn, int32_t dbn, int32_t lastbn, in
 	bp = getblk(vp, lbn, (int)fs->e2fs_bsize, 0, 0);
 	if (!(bp->b_flags & (B_DONE | B_DELWRI))) {
 		curproc->p_stats->p_ru.ru_inblock++;	/* pay for read */
+		bcstats.pendingreads++;
+		bcstats.numreads++;
 		bp->b_flags |= B_READ;
 		if (bp->b_bcount > bp->b_bufsize)
 			panic("ext2fs_indirtrunc: bad buffer size");
diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c
index 6105983adbf..4021a9da0fc 100644
--- a/sys/ufs/ffs/ffs_inode.c
+++ b/sys/ufs/ffs/ffs_inode.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: ffs_inode.c,v 1.52 2008/01/05 19:49:26 otto Exp $	*/
+/*	$OpenBSD: ffs_inode.c,v 1.53 2008/06/10 20:14:37 beck Exp $	*/
 /*	$NetBSD: ffs_inode.c,v 1.10 1996/05/11 18:27:19 mycroft Exp $	*/
 
 /*
@@ -501,6 +501,8 @@ ffs_indirtrunc(struct inode *ip, daddr64_t lbn, daddr64_t dbn,
 	bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0);
 	if (!(bp->b_flags & (B_DONE | B_DELWRI))) {
 		curproc->p_stats->p_ru.ru_inblock++;	/* pay for read */
+		bcstats.pendingreads++;
+		bcstats.numreads++;
 		bp->b_flags |= B_READ;
 		if (bp->b_bcount > bp->b_bufsize)
 			panic("ffs_indirtrunc: bad buffer size");
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index e2f88051a8a..8b74fef005e 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: ffs_softdep.c,v 1.94 2008/01/05 19:49:26 otto Exp $	*/
+/*	$OpenBSD: ffs_softdep.c,v 1.95 2008/06/10 20:14:37 beck Exp $	*/
 
 /*
 * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
@@ -4691,9 +4691,9 @@ softdep_fsync_mountdev(vp, waitfor)
 		/*
 		 * If it is already scheduled, skip to the next buffer.
 		 */
+		splassert(IPL_BIO);
 		if (bp->b_flags & B_BUSY)
 			continue;
-		bp->b_flags |= B_BUSY;
 
 		if ((bp->b_flags & B_DELWRI) == 0) {
 			FREE_LOCK(&lk);
@@ -4705,10 +4705,10 @@ softdep_fsync_mountdev(vp, waitfor)
 		 */
 		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
 		    wk->wk_type != D_BMSAFEMAP) {
-			bp->b_flags &= ~B_BUSY;
 			continue;
 		}
 		bremfree(bp);
+		buf_acquire(bp);
 		FREE_LOCK(&lk);
 		(void) bawrite(bp);
 		ACQUIRE_LOCK(&lk);
@@ -5616,7 +5616,7 @@ getdirtybuf(bp, waitfor)
 	if ((bp->b_flags & B_DELWRI) == 0)
 		return (0);
 	bremfree(bp);
-	bp->b_flags |= B_BUSY;
+	buf_acquire(bp);
 	return (1);
 }
 
diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c
index 9e08f8b475f..489a4880473 100644
--- a/sys/ufs/ffs/ffs_vnops.c
+++ b/sys/ufs/ffs/ffs_vnops.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: ffs_vnops.c,v 1.48 2008/05/08 17:45:45 thib Exp $	*/
+/*	$OpenBSD: ffs_vnops.c,v 1.49 2008/06/10 20:14:37 beck Exp $	*/
 /*	$NetBSD: ffs_vnops.c,v 1.7 1996/05/11 18:27:24 mycroft Exp $	*/
 
 /*
@@ -449,7 +449,8 @@ loop:
 		}
 
 		bremfree(bp);
-		bp->b_flags |= B_BUSY | B_SCANNED;
+		buf_acquire(bp);
+		bp->b_flags |= B_SCANNED;
 		splx(s);
 		/*
 		 * On our final pass through, do all I/O synchronously
diff --git a/sys/ufs/ufs/ufs_bmap.c b/sys/ufs/ufs/ufs_bmap.c
index fa22c0934ec..75205c56bc3 100644
--- a/sys/ufs/ufs/ufs_bmap.c
+++ b/sys/ufs/ufs/ufs_bmap.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: ufs_bmap.c,v 1.26 2008/01/05 19:49:26 otto Exp $	*/
+/*	$OpenBSD: ufs_bmap.c,v 1.27 2008/06/10 20:14:37 beck Exp $	*/
 /*	$NetBSD: ufs_bmap.c,v 1.3 1996/02/09 22:36:00 christos Exp $	*/
 
 /*
@@ -173,6 +173,8 @@ ufs_bmaparray(struct vnode *vp, daddr64_t bn, daddr64_t *bnp, struct indir *ap,
 		else {
 			bp->b_blkno = blkptrtodb(ump, daddr);
 			bp->b_flags |= B_READ;
+			bcstats.pendingreads++;
+			bcstats.numreads++;
 			VOP_STRATEGY(bp);
 			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
 			if ((error = biowait(bp)) != 0) {
diff --git a/sys/ufs/ufs/ufs_dirhash.c b/sys/ufs/ufs/ufs_dirhash.c
index d6ebabe70fe..5391c977caa 100644
--- a/sys/ufs/ufs/ufs_dirhash.c
+++ b/sys/ufs/ufs/ufs_dirhash.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: ufs_dirhash.c,v 1.17 2007/10/08 19:26:48 krw Exp $	*/
+/*	$OpenBSD: ufs_dirhash.c,v 1.18 2008/06/10 20:14:37 beck Exp $	*/
 
 /*
 * Copyright (c) 2001, 2002 Ian Dowse. All rights reserved.
 *
@@ -211,7 +211,6 @@ ufsdirhash_build(struct inode *ip)
 			if (UFS_BUFATOFF(ip, (off_t)pos, NULL, &bp) != 0)
 				goto fail;
 		}
-
 		/* Add this entry to the hash. */
 		ep = (struct direct *)((char *)bp->b_data + (pos & bmask));
 		if (ep->d_reclen == 0 || ep->d_reclen >