author    Bob Beck <beck@cvs.openbsd.org>  2008-06-10 20:14:38 +0000
committer Bob Beck <beck@cvs.openbsd.org>  2008-06-10 20:14:38 +0000
commit    b24b61b32a9fedc5de53427f16b7eb72e27e858d (patch)
tree      7fb0535d861bd759c5fd3ec6d937d5aa85b4aeda
parent    cb3bb3cecc1d0474c3e63d1cdf36ba8c39fdf873 (diff)
Buffer cache revamp
1) remove multiple size queues, introduced as a stopgap
2) decouple pages containing data from their mappings
3) only keep buffers mapped when they actually have to be mapped (right now, this is when buffers are B_BUSY)
4) New functions to make a buffer busy, and release the busy flag (buf_acquire and buf_release)
5) Move high/low water marks and statistics counters into a structure
6) Add a sysctl to retrieve buffer cache statistics

Tested in several variants and beat upon by bob and art for a year. Run accidentally on henning's nfs server for a few months...

ok deraadt@, krw@, art@ - who promises to be around to deal with any fallout
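Item 6 can be exercised from userland; a minimal sketch, assuming the VFS_BCACHESTAT node and struct bcachestats exactly as added to sys/sys/mount.h in this diff (this program is illustrative, not part of the commit):

	#include <sys/param.h>
	#include <sys/sysctl.h>
	#include <sys/mount.h>

	#include <stdio.h>
	#include <err.h>

	int
	main(void)
	{
		/* vfs.generic.bcachestat, per the mount.h hunk below */
		int mib[3] = { CTL_VFS, VFS_GENERIC, VFS_BCACHESTAT };
		struct bcachestats bcs;
		size_t len = sizeof(bcs);

		if (sysctl(mib, 3, &bcs, &len, NULL, 0) == -1)
			err(1, "vfs.generic.bcachestat");

		printf("%ld bufs (%ld free), %ld pages cached\n",
		    bcs.numbufs, bcs.freebufs, bcs.numbufpages);
		printf("%ld dirty / %ld clean pages on the freelists\n",
		    bcs.numdirtypages, bcs.numcleanpages);
		printf("reads %ld (hits %ld), writes %ld\n",
		    bcs.numreads, bcs.cachehits, bcs.numwrites);
		return (0);
	}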
-rw-r--r--  sys/conf/files                  |   3
-rw-r--r--  sys/isofs/udf/udf_subr.c        |   6
-rw-r--r--  sys/kern/kern_sched.c           |   4
-rw-r--r--  sys/kern/spec_vnops.c           |   4
-rw-r--r--  sys/kern/subr_disk.c            |   4
-rw-r--r--  sys/kern/vfs_bio.c              | 418
-rw-r--r--  sys/kern/vfs_biomem.c           | 305
-rw-r--r--  sys/kern/vfs_subr.c             |  16
-rw-r--r--  sys/miscfs/specfs/spec_vnops.c  |   4
-rw-r--r--  sys/nfs/nfs_syscalls.c          |   5
-rw-r--r--  sys/nfs/nfs_vnops.c             |  17
-rw-r--r--  sys/scsi/cd.c                   |   4
-rw-r--r--  sys/sys/buf.h                   |  27
-rw-r--r--  sys/sys/mount.h                 |  25
-rw-r--r--  sys/ufs/ext2fs/ext2fs_bmap.c    |   3
-rw-r--r--  sys/ufs/ext2fs/ext2fs_inode.c   |   4
-rw-r--r--  sys/ufs/ffs/ffs_inode.c         |   4
-rw-r--r--  sys/ufs/ffs/ffs_softdep.c       |   8
-rw-r--r--  sys/ufs/ffs/ffs_vnops.c         |   5
-rw-r--r--  sys/ufs/ufs/ufs_bmap.c          |   4
-rw-r--r--  sys/ufs/ufs/ufs_dirhash.c       |   3
21 files changed, 572 insertions, 301 deletions
diff --git a/sys/conf/files b/sys/conf/files
index 958ab8fd5fd..ce464781f75 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1,4 +1,4 @@
-# $OpenBSD: files,v 1.432 2008/06/09 22:47:41 djm Exp $
+# $OpenBSD: files,v 1.433 2008/06/10 20:14:36 beck Exp $
# $NetBSD: files,v 1.87 1996/05/19 17:17:50 jonathan Exp $
# @(#)files.newconf 7.5 (Berkeley) 5/10/93
@@ -707,6 +707,7 @@ file kern/uipc_socket2.c
file kern/uipc_syscalls.c
file kern/uipc_usrreq.c
file kern/vfs_bio.c
+file kern/vfs_biomem.c
file kern/vfs_cache.c
file kern/vfs_cluster.c
file kern/vfs_conf.c
diff --git a/sys/isofs/udf/udf_subr.c b/sys/isofs/udf/udf_subr.c
index 1c24b583be7..b81da604b1f 100644
--- a/sys/isofs/udf/udf_subr.c
+++ b/sys/isofs/udf/udf_subr.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: udf_subr.c,v 1.13 2007/06/08 05:35:32 deraadt Exp $ */
+/* $OpenBSD: udf_subr.c,v 1.14 2008/06/10 20:14:36 beck Exp $ */
/*
* Copyright (c) 2006, Miodrag Vallat
@@ -110,7 +110,7 @@ udf_disklabelspoof(dev_t dev, void (*strat)(struct buf *),
*/
bp->b_blkno = sector * btodb(bsize);
bp->b_bcount = bsize;
- bp->b_flags = B_BUSY | B_READ;
+ bp->b_flags |= B_READ;
bp->b_resid = bp->b_blkno / lp->d_secpercyl;
(*strat)(bp);
@@ -130,7 +130,7 @@ udf_disklabelspoof(dev_t dev, void (*strat)(struct buf *),
for (sector = mvds_start; sector < mvds_end; sector++) {
bp->b_blkno = sector * btodb(bsize);
bp->b_bcount = bsize;
- bp->b_flags = B_BUSY | B_READ;
+ bp->b_flags |= B_READ;
bp->b_resid = bp->b_blkno / lp->d_secpercyl;
(*strat)(bp);
diff --git a/sys/kern/kern_sched.c b/sys/kern/kern_sched.c
index 3357da7c492..0759e01f2b8 100644
--- a/sys/kern/kern_sched.c
+++ b/sys/kern/kern_sched.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: kern_sched.c,v 1.3 2008/06/08 20:13:13 thib Exp $ */
+/* $OpenBSD: kern_sched.c,v 1.4 2008/06/10 20:14:36 beck Exp $ */
/*
* Copyright (c) 2007 Artur Grabowski <art@openbsd.org>
*
@@ -113,6 +113,8 @@ sched_idle(void *v)
}
}
+ splassert(IPL_NONE);
+
cpu_idle_enter();
while (sched_is_idle())
cpu_idle_cycle();
diff --git a/sys/kern/spec_vnops.c b/sys/kern/spec_vnops.c
index e175105abc1..3af4f6c1a10 100644
--- a/sys/kern/spec_vnops.c
+++ b/sys/kern/spec_vnops.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: spec_vnops.c,v 1.49 2008/05/08 17:45:45 thib Exp $ */
+/* $OpenBSD: spec_vnops.c,v 1.50 2008/06/10 20:14:36 beck Exp $ */
/* $NetBSD: spec_vnops.c,v 1.29 1996/04/22 01:42:38 christos Exp $ */
/*
@@ -447,7 +447,7 @@ loop:
if ((bp->b_flags & B_DELWRI) == 0)
panic("spec_fsync: not dirty");
bremfree(bp);
- bp->b_flags |= B_BUSY;
+ buf_acquire(bp);
splx(s);
bawrite(bp);
goto loop;
diff --git a/sys/kern/subr_disk.c b/sys/kern/subr_disk.c
index 92a267e9c1d..fe111450c96 100644
--- a/sys/kern/subr_disk.c
+++ b/sys/kern/subr_disk.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: subr_disk.c,v 1.74 2008/05/23 00:51:33 krw Exp $ */
+/* $OpenBSD: subr_disk.c,v 1.75 2008/06/10 20:14:36 beck Exp $ */
/* $NetBSD: subr_disk.c,v 1.17 1996/03/16 23:17:08 christos Exp $ */
/*
@@ -399,7 +399,7 @@ readdoslabel(struct buf *bp, void (*strat)(struct buf *),
/* read boot record */
bp->b_blkno = part_blkno;
bp->b_bcount = lp->d_secsize;
- bp->b_flags = B_BUSY | B_READ;
+ bp->b_flags = B_BUSY | B_READ | B_RAW;
(*strat)(bp);
if (biowait(bp)) {
/*wrong*/ if (partoffp)
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 4d11266537c..f44253ef163 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: vfs_bio.c,v 1.103 2008/03/16 19:42:57 otto Exp $ */
+/* $OpenBSD: vfs_bio.c,v 1.104 2008/06/10 20:14:36 beck Exp $ */
/* $NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $ */
/*-
@@ -78,14 +78,12 @@ u_long bufhash;
/*
* Definitions for the buffer free lists.
*/
-#define BQUEUES 6 /* number of free buffer queues */
+#define BQUEUES 2 /* number of free buffer queues */
#define BQ_DIRTY 0 /* LRU queue with dirty buffers */
-
+#define BQ_CLEAN 1 /* LRU queue with clean buffers */
TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
-int bqpages[BQUEUES]; /* pages allocated, per queue */
-int bqpagelow;
int needbuffer;
struct bio_ops bioops;
@@ -93,7 +91,6 @@ struct bio_ops bioops;
* Buffer pool for I/O buffers.
*/
struct pool bufpool;
-struct vm_map *buf_map;
struct bufhead bufhead = LIST_HEAD_INITIALIZER(bufhead);
struct buf *buf_get(size_t);
struct buf *buf_stub(struct vnode *, daddr64_t);
@@ -107,7 +104,7 @@ void buf_put(struct buf *);
struct buf *bio_doread(struct vnode *, daddr64_t, int, int);
struct buf *getnewbuf(size_t, int, int, int *);
-void buf_init(struct buf *, int);
+void buf_init(struct buf *);
void bread_cluster_callback(struct buf *);
/*
@@ -117,54 +114,33 @@ void bread_cluster_callback(struct buf *);
* numdirtypages - number of pages on BQ_DIRTY queue.
* lodirtypages - low water mark for buffer cleaning daemon.
* hidirtypages - high water mark for buffer cleaning daemon.
- * numfreepages - number of pages on BQ_CLEAN and BQ_DIRTY queues. unused.
* numcleanpages - number of pages on BQ_CLEAN queue.
* Used to track the need to speedup the cleaner and
* as a reserve for special processes like syncer.
* maxcleanpages - the highest page count on BQ_CLEAN.
*/
-long numbufpages;
-long numdirtypages;
+
+struct bcachestats bcstats;
long lodirtypages;
long hidirtypages;
-long numfreepages;
-long numcleanpages;
long locleanpages;
long hicleanpages;
long maxcleanpages;
-struct proc *cleanerproc;
-int bd_req; /* Sleep point for cleaner daemon. */
+/* XXX - should be defined here. */
+extern int bufcachepercent;
-int size2cqueue(int *size);
+vsize_t bufkvm;
-int
-size2cqueue(int *size)
-{
- int i = 0, q;
- int s = *size;
- s -= 1;
- while (s > 0) {
- s = s >> 1;
- i++;
- }
- if (i < PAGE_SHIFT) {
- i = PAGE_SHIFT; /* < 4096 -> 4096 */
- }
- *size = 1 << i;
- q = (i + 1 - PAGE_SHIFT); /* XXX 4096 is queue 1 */
- if (q >= BQUEUES)
- panic("queue %d > BQUEUES %d", q, BQUEUES);
- if (q == 0)
- panic("can't return dirty q");
- return(q);
-}
+struct proc *cleanerproc;
+int bd_req; /* Sleep point for cleaner daemon. */
void
bremfree(struct buf *bp)
{
struct bqueues *dp = NULL;
- int queue;
+
+ splassert(IPL_BIO);
/*
* We only calculate the head of the freelist when removing
@@ -180,38 +156,26 @@ bremfree(struct buf *bp)
if (dp == &bufqueues[BQUEUES])
panic("bremfree: lost tail");
}
- numfreepages -= atop(bp->b_bufsize);
if (!ISSET(bp->b_flags, B_DELWRI)) {
- int qs = bp->b_bufsize;
- queue = size2cqueue(&qs);
- numcleanpages -= atop(bp->b_bufsize);
- bqpages[queue] -= atop(bp->b_bufsize);
- } else
- numdirtypages -= atop(bp->b_bufsize);
+ bcstats.numcleanpages -= atop(bp->b_bufsize);
+ } else {
+ bcstats.numdirtypages -= atop(bp->b_bufsize);
+ }
TAILQ_REMOVE(dp, bp, b_freelist);
+ bcstats.freebufs--;
}
void
-buf_init(struct buf *bp, int size)
+buf_init(struct buf *bp)
{
- int npages, queue;
-
splassert(IPL_BIO);
- npages = atop(size);
bzero((char *)bp, sizeof *bp);
bp->b_vnbufs.le_next = NOLIST;
bp->b_freelist.tqe_next = NOLIST;
bp->b_synctime = time_uptime + 300;
bp->b_dev = NODEV;
- queue = size2cqueue(&size);
LIST_INIT(&bp->b_dep);
- numbufpages += npages;
- numfreepages += npages;
- numcleanpages += npages;
- bqpages[queue] += npages;
- if (maxcleanpages < numcleanpages)
- maxcleanpages = numcleanpages;
}
/*
@@ -238,7 +202,7 @@ buf_stub(struct vnode *vp, daddr64_t lblkno)
bp->b_dev = NODEV;
bp->b_bufsize = 0;
bp->b_data = NULL;
- bp->b_flags = B_BUSY;
+ bp->b_flags = 0;
bp->b_dev = NODEV;
bp->b_blkno = bp->b_lblkno = lblkno;
bp->b_iodone = NULL;
@@ -250,8 +214,11 @@ buf_stub(struct vnode *vp, daddr64_t lblkno)
LIST_INIT(&bp->b_dep);
+ buf_acquire_unmapped(bp);
+
s = splbio();
LIST_INSERT_HEAD(&bufhead, bp, b_list);
+ bcstats.numbufs++;
bgetvp(vp, bp);
splx(s);
@@ -261,39 +228,31 @@ buf_stub(struct vnode *vp, daddr64_t lblkno)
struct buf *
buf_get(size_t size)
{
- struct bqueues *dp;
struct buf *bp;
int npages;
- int queue, qs;
- void *data;
splassert(IPL_BIO);
KASSERT(size > 0);
size = round_page(size);
- qs = size;
- queue = size2cqueue(&qs);
- npages = atop(qs);
+ npages = atop(size);
- if (numbufpages + npages > bufpages)
+ if (bcstats.numbufpages + npages > bufpages)
return (NULL);
bp = pool_get(&bufpool, PR_WAITOK);
- data = (void *)uvm_km_alloc(buf_map, qs);
- if (data == NULL) {
- pool_put(&bufpool, bp);
- return (NULL);
- }
- buf_init(bp, qs);
+ buf_init(bp);
bp->b_flags = B_INVAL;
- bp->b_bufsize = qs;
- bp->b_data = data;
- dp = &bufqueues[queue];
- binsheadfree(bp, dp);
+ buf_alloc_pages(bp, size);
+ bp->b_data = NULL;
+ binsheadfree(bp, &bufqueues[BQ_CLEAN]);
binshash(bp, &invalhash);
LIST_INSERT_HEAD(&bufhead, bp, b_list);
+ bcstats.numbufs++;
+ bcstats.freebufs++;
+ bcstats.numcleanpages += atop(bp->b_bufsize);
return (bp);
}
@@ -303,7 +262,7 @@ buf_put(struct buf *bp)
{
splassert(IPL_BIO);
#ifdef DIAGNOSTIC
- if (bp->b_data != NULL)
+ if (bp->b_pobj != NULL)
KASSERT(bp->b_bufsize > 0);
#endif
#ifdef DIAGNOSTIC
@@ -320,13 +279,10 @@ buf_put(struct buf *bp)
panic("buf_put: b_dep is not empty");
#endif
LIST_REMOVE(bp, b_list);
+ bcstats.numbufs--;
- if (bp->b_data != NULL) {
- bremhash(bp);
- numbufpages -= atop(bp->b_bufsize);
- uvm_km_free(buf_map, (vaddr_t)bp->b_data, bp->b_bufsize);
- }
-
+ if (buf_dealloc_mem(bp) != 0)
+ return;
pool_put(&bufpool, bp);
}
@@ -336,39 +292,56 @@ buf_put(struct buf *bp)
void
bufinit(void)
{
- vaddr_t minaddr, maxaddr;
struct bqueues *dp;
+ /* XXX - for now */
+ bufpages = bufcachepercent = bufkvm = 0;
+
+ /*
+ * If MD code doesn't say otherwise, use 10% of kvm for mappings and
+ * 10% physmem for pages.
+ */
+ if (bufcachepercent == 0)
+ bufcachepercent = 10;
+ if (bufpages == 0)
+ bufpages = physmem * bufcachepercent / 100;
+
+ if (bufkvm == 0)
+ bufkvm = (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 10;
+
+ /*
+ * Don't use more than twice the amount of bufpages for mappings.
+ * It's twice since we map things sparsely.
+ */
+ if (bufkvm > bufpages * PAGE_SIZE)
+ bufkvm = bufpages * PAGE_SIZE;
+ /*
+ * Round bufkvm to MAXPHYS because we allocate chunks of va space
+ * in MAXPHYS chunks.
+ */
+ bufkvm &= ~(MAXPHYS - 1);
+
pool_init(&bufpool, sizeof(struct buf), 0, 0, 0, "bufpl", NULL);
pool_setipl(&bufpool, IPL_BIO);
for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
TAILQ_INIT(dp);
- minaddr = vm_map_min(kernel_map);
- buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
- ptoa(bufpages), 0, FALSE, NULL);
- /*
- * XXX don't starve any one queue below 5% of the total number
- * of buffer cache pages.
- */
- bqpagelow = bufpages / 20;
+ /*
+ * hmm - bufkvm is an argument because it's static, while
+ * bufpages is global because it can change while running.
+ */
+ buf_mem_init(bufkvm);
bufhashtbl = hashinit(bufpages / 4, M_CACHE, M_WAITOK, &bufhash);
hidirtypages = (bufpages / 4) * 3;
lodirtypages = bufpages / 2;
/*
- * Reserve 5% of bufpages for syncer's needs,
- * but not more than 25% and if possible
- * not less than 2 * MAXBSIZE. locleanpages
- * value must be not too small
+ * When we hit 95% of pages being clean, we bring them down to
+ * 90% to have some slack.
*/
- hicleanpages = bufpages / 2;
- locleanpages = hicleanpages / 2;
- if (locleanpages < atop(2 * MAXBSIZE))
- locleanpages = atop(2 * MAXBSIZE);
- if (locleanpages > bufpages / 4)
- locleanpages = bufpages / 4;
+ hicleanpages = bufpages - (bufpages / 20);
+ locleanpages = bufpages - (bufpages / 10);
maxcleanpages = locleanpages;
}
@@ -388,8 +361,9 @@ bio_doread(struct vnode *vp, daddr64_t blkno, int size, int async)
*/
if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
SET(bp->b_flags, B_READ | async);
+ bcstats.pendingreads++;
+ bcstats.numreads++;
VOP_STRATEGY(bp);
-
/* Pay for the read. */
curproc->p_stats->p_ru.ru_inblock++; /* XXX */
} else if (async) {
@@ -477,7 +451,7 @@ bread_cluster_callback(struct buf *bp)
}
free(xbpp, M_TEMP);
- bp->b_data = NULL;
+ bp->b_pobj = NULL;
buf_put(bp);
}
@@ -485,9 +459,8 @@ int
bread_cluster(struct vnode *vp, daddr64_t blkno, int size, struct buf **rbpp)
{
struct buf *bp, **xbpp;
- int howmany, i, maxra, inc;
+ int howmany, maxra, i, inc;
daddr64_t sblkno;
- size_t spill;
*rbpp = bio_doread(vp, blkno, size, 0);
@@ -544,11 +517,16 @@ bread_cluster(struct vnode *vp, daddr64_t blkno, int size, struct buf **rbpp)
inc = btodb(size);
for (i = 0; i < howmany; i++) {
+ bcstats.pendingreads++;
+ bcstats.numreads++;
SET(xbpp[i]->b_flags, B_READ | B_ASYNC);
binshash(xbpp[i], BUFHASH(vp, xbpp[i]->b_lblkno));
xbpp[i]->b_blkno = sblkno + (i * inc);
xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size;
- xbpp[i]->b_data = bp->b_data + (i * size);
+ xbpp[i]->b_data = NULL;
+ xbpp[i]->b_pobj = bp->b_pobj;
+ xbpp[i]->b_poffs = bp->b_poffs + (i * size);
+ buf_acquire_unmapped(xbpp[i]);
}
bp->b_blkno = sblkno;
@@ -557,12 +535,8 @@ bread_cluster(struct vnode *vp, daddr64_t blkno, int size, struct buf **rbpp)
bp->b_saveaddr = (void *)xbpp;
bp->b_iodone = bread_cluster_callback;
bp->b_vp = vp;
- spill = bp->b_bufsize - bp->b_bcount;
- if (spill) {
- uvm_km_free(buf_map, (vaddr_t) bp->b_data + bp->b_bcount,
- spill);
- numbufpages -= atop(spill);
- }
+ bcstats.pendingreads++;
+ bcstats.numreads++;
VOP_STRATEGY(bp);
curproc->p_stats->p_ru.ru_inblock++;
@@ -609,6 +583,8 @@ bwrite(struct buf *bp)
else
mp->mnt_stat.f_syncwrites++;
}
+ bcstats.pendingwrites++;
+ bcstats.numwrites++;
wasdelayed = ISSET(bp->b_flags, B_DELWRI);
CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
@@ -723,6 +699,11 @@ buf_dirty(struct buf *bp)
{
splassert(IPL_BIO);
+#ifdef DIAGNOSTIC
+ if (!ISSET(bp->b_flags, B_BUSY))
+ panic("Trying to dirty buffer on freelist!");
+#endif
+
if (ISSET(bp->b_flags, B_DELWRI) == 0) {
SET(bp->b_flags, B_DELWRI);
bp->b_synctime = time_uptime + 35;
@@ -738,6 +719,10 @@ buf_undirty(struct buf *bp)
{
splassert(IPL_BIO);
+#ifdef DIAGNOSTIC
+ if (!ISSET(bp->b_flags, B_BUSY))
+ panic("Trying to undirty buffer on freelist!");
+#endif
if (ISSET(bp->b_flags, B_DELWRI)) {
CLR(bp->b_flags, B_DELWRI);
reassignbuf(bp);
@@ -769,8 +754,6 @@ brelse(struct buf *bp)
SET(bp->b_flags, B_INVAL);
if (ISSET(bp->b_flags, B_INVAL)) {
- int queue, qs;
-
/*
* If the buffer is invalid, place it in the clean queue, so it
* can be reused.
@@ -789,37 +772,29 @@ brelse(struct buf *bp)
* If the buffer has no associated data, place it back in the
* pool.
*/
- if (bp->b_data == NULL) {
+ if (bp->b_data == NULL && bp->b_pobj == NULL) {
buf_put(bp);
splx(s);
return;
}
- qs = bp->b_bufsize;
- queue = size2cqueue(&qs);
- numcleanpages += atop(bp->b_bufsize);
- bqpages[queue] += atop(bp->b_bufsize);
- if (maxcleanpages < numcleanpages)
- maxcleanpages = numcleanpages;
- binsheadfree(bp, &bufqueues[queue]);
+ bcstats.numcleanpages += atop(bp->b_bufsize);
+ if (maxcleanpages < bcstats.numcleanpages)
+ maxcleanpages = bcstats.numcleanpages;
+ binsheadfree(bp, &bufqueues[BQ_CLEAN]);
} else {
/*
* It has valid data. Put it on the end of the appropriate
* queue, so that it'll stick around for as long as possible.
*/
- int queue, qs;
- numfreepages += atop(bp->b_bufsize);
- qs = bp->b_bufsize;
- queue = size2cqueue(&qs);
if (!ISSET(bp->b_flags, B_DELWRI)) {
- numcleanpages += atop(bp->b_bufsize);
- bqpages[queue] += atop(bp->b_bufsize);
- if (maxcleanpages < numcleanpages)
- maxcleanpages = numcleanpages;
- bufq = &bufqueues[queue];
+ bcstats.numcleanpages += atop(bp->b_bufsize);
+ if (maxcleanpages < bcstats.numcleanpages)
+ maxcleanpages = bcstats.numcleanpages;
+ bufq = &bufqueues[BQ_CLEAN];
} else {
- numdirtypages += atop(bp->b_bufsize);
+ bcstats.numdirtypages += atop(bp->b_bufsize);
bufq = &bufqueues[BQ_DIRTY];
}
if (ISSET(bp->b_flags, B_AGE)) {
@@ -832,7 +807,9 @@ brelse(struct buf *bp)
}
/* Unlock the buffer. */
- CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE | B_DEFERRED));
+ bcstats.freebufs++;
+ CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE | B_DEFERRED));
+ buf_release(bp);
/* Wake up any processes waiting for any buffer to become free. */
if (needbuffer) {
@@ -917,8 +894,10 @@ start:
}
if (!ISSET(bp->b_flags, B_INVAL)) {
- SET(bp->b_flags, (B_BUSY | B_CACHE));
+ bcstats.cachehits++;
bremfree(bp);
+ SET(bp->b_flags, B_CACHE);
+ buf_acquire(bp);
splx(s);
break;
}
@@ -946,6 +925,10 @@ start:
bgetvp(vp, bp);
splx(s);
}
+#ifdef DIAGNOSTIC
+ if (!ISSET(bp->b_flags, B_BUSY))
+ panic("getblk buffer not B_BUSY");
+#endif
return (bp);
}
@@ -972,7 +955,7 @@ struct buf *
getnewbuf(size_t size, int slpflag, int slptimeo, int *ep)
{
struct buf *bp;
- int s, error, queue, qs;
+ int s;
#if 0 /* we would really like this but sblock update kills it */
KASSERT(curproc != syncerproc && curproc != cleanerproc);
@@ -982,72 +965,47 @@ getnewbuf(size_t size, int slpflag, int slptimeo, int *ep)
/*
* Wake up cleaner if we're getting low on pages.
*/
- if (numdirtypages >= hidirtypages || numcleanpages <= locleanpages)
+ if (bcstats.numdirtypages >= hidirtypages || bcstats.numcleanpages <= locleanpages)
wakeup(&bd_req);
+ /*
+ * If we're above the high water mark for clean pages,
+ * free down to the low water mark.
+ */
+ if (bcstats.numcleanpages > hicleanpages) {
+ while (bcstats.numcleanpages > locleanpages) {
+ bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]);
+ bremfree(bp);
+ if (bp->b_vp)
+ brelvp(bp);
+ bremhash(bp);
+ buf_put(bp);
+ }
+ }
+
/* we just ask. it can say no.. */
getsome:
- qs = size;
- queue = size2cqueue(&qs);
- bp = buf_get(qs); /* XXX use qs instead and no need in buf_get? */
+ bp = buf_get(size);
if (bp == NULL) {
- /*
- * No free ones, try to reuse a clean one of the same or
- * larger size.
- */
- do {
- bp = TAILQ_FIRST(&bufqueues[queue]);
- queue++;
- } while (bp == NULL && queue < BQUEUES);
- }
- if (bp == NULL) {
- /* we couldn't reuse a free one, nothing of the right size */
- /* XXX free 20 buffers per q - ugly hack should really
- * reuse big ones without truncating. fix later
- */
- int q, gotsome = 0;
- int freemax = 20;
- for (q = 1; q < BQUEUES; q++) {
- int i = freemax;
- while (bqpages[q] > bqpagelow
- && (bp = TAILQ_FIRST(&bufqueues[q]))
- && i--) {
- gotsome++;
- bremfree(bp);
- if (LIST_FIRST(&bp->b_dep) != NULL)
- buf_deallocate(bp);
-
- if (ISSET(bp->b_flags, B_DELWRI)) {
- CLR(bp->b_flags, B_DELWRI);
- }
-
- if (bp->b_vp)
- brelvp(bp);
-
- buf_put(bp);
- }
+ int freemax = 5;
+ int i = freemax;
+ while ((bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN])) && i--) {
+ bremfree(bp);
+ if (bp->b_vp)
+ brelvp(bp);
+ bremhash(bp);
+ buf_put(bp);
}
- if (gotsome)
+ if (freemax != i)
goto getsome;
- }
- if (bp == NULL) {
- /* wait for a free buffer of any kind */
- needbuffer++;
- error = tsleep(&needbuffer, slpflag | (PRIBIO + 1),
- "getnewbuf", slptimeo);
- if (ep != NULL) {
- *ep = error;
- if (error) {
- splx(s);
- return (NULL);
- }
- }
- goto getsome;
+ splx(s);
+ return (NULL);
}
bremfree(bp);
/* Buffer is no longer on free lists. */
- SET(bp->b_flags, B_BUSY);
+ bp->b_flags = 0;
+ buf_acquire(bp);
#ifdef DIAGNOSTIC
if (ISSET(bp->b_flags, B_DELWRI))
@@ -1067,7 +1025,6 @@ getsome:
#endif
/* clear out various other fields */
- bp->b_flags = B_BUSY;
bp->b_dev = NODEV;
bp->b_blkno = bp->b_lblkno = 0;
bp->b_iodone = NULL;
@@ -1095,8 +1052,7 @@ buf_daemon(struct proc *p)
s = splbio();
for (;;) {
- if (!numdirtypages ||
- (numdirtypages < hidirtypages && !needbuffer))
+ if (bcstats.numdirtypages < hidirtypages)
tsleep(&bd_req, PRIBIO - 7, "cleaner", 0);
getmicrouptime(&starttime);
@@ -1104,11 +1060,11 @@ buf_daemon(struct proc *p)
while ((bp = TAILQ_FIRST(&bufqueues[BQ_DIRTY]))) {
struct timeval tv;
- if (numdirtypages < lodirtypages && !needbuffer)
+ if (bcstats.numdirtypages < lodirtypages)
break;
bremfree(bp);
- SET(bp->b_flags, B_BUSY);
+ buf_acquire(bp);
splx(s);
if (ISSET(bp->b_flags, B_INVAL)) {
@@ -1125,10 +1081,10 @@ buf_daemon(struct proc *p)
buf_countdeps(bp, 0, 0)) {
SET(bp->b_flags, B_DEFERRED);
s = splbio();
- numfreepages += atop(bp->b_bufsize);
- numdirtypages += atop(bp->b_bufsize);
+ bcstats.numdirtypages += atop(bp->b_bufsize);
binstailfree(bp, &bufqueues[BQ_DIRTY]);
- CLR(bp->b_flags, B_BUSY);
+ bcstats.freebufs++;
+ buf_release(bp);
continue;
}
@@ -1154,6 +1110,8 @@ biowait(struct buf *bp)
{
int s;
+ KASSERT(!(bp->b_flags & B_ASYNC));
+
s = splbio();
while (!ISSET(bp->b_flags, B_DONE))
tsleep(bp, PRIBIO + 1, "biowait", 0);
@@ -1203,8 +1161,11 @@ biodone(struct buf *bp)
if (!ISSET(bp->b_flags, B_READ)) {
CLR(bp->b_flags, B_WRITEINPROG);
+ bcstats.pendingwrites--;
vwakeup(bp->b_vp);
- }
+ } else if (bcstats.numbufs &&
+ (!(ISSET(bp->b_flags, B_RAW) || ISSET(bp->b_flags, B_PHYS))))
+ bcstats.pendingreads--;
if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */
CLR(bp->b_flags, B_CALL); /* but note callout done */
@@ -1218,66 +1179,3 @@ biodone(struct buf *bp)
}
}
}
-
-#if 1
-void
-vfs_bufstats(void) {
- return;
-}
-/* #ifdef DDB */
-#else
-/*
- * Print out statistics on the current allocation of the buffer pool.
- * Can be enabled to print out on every ``sync'' by setting "syncprt"
- * in vfs_syscalls.c using sysctl.
- */
-void
-vfs_bufstats(void)
-{
- int s, i, j, count;
- struct buf *bp;
- struct bqueues *dp;
- int counts[MAXBSIZE/PAGE_SIZE+1];
- int totals[BQUEUES];
- long ptotals[BQUEUES];
- long pages;
- static char *bname[BQUEUES] = { "CLEAN", "DIRTY", "EMPTY" };
-
- s = splbio();
- for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
- count = 0;
- pages = 0;
- for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
- counts[j] = 0;
- TAILQ_FOREACH(bp, dp, b_freelist) {
- counts[bp->b_bufsize/PAGE_SIZE]++;
- count++;
- pages += atop(bp->b_bufsize);
- }
- totals[i] = count;
- ptotals[i] = pages;
- printf("%s: total-%d(%d pages)", bname[i], count, pages);
- for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
- if (counts[j] != 0)
- printf(", %d-%d", j * PAGE_SIZE, counts[j]);
- printf("\n");
- }
- if ((ptotals[BQ_CLEAN] + ptotals[BQ_DIRTY]) != numfreepages)
- printf("numfreepages counter wrong: %ld != %ld\n",
- numfreepages, ptotals[BQ_CLEAN] + ptotals[BQ_DIRTY]);
- if (ptotals[BQ_CLEAN] != numcleanpages)
- printf("numcleanpages counter wrong: %ld != %ld\n",
- numcleanpages, ptotals[BQ_CLEAN]);
- else
- printf("numcleanpages: %ld\n", numcleanpages);
- if (numdirtypages != ptotals[BQ_DIRTY])
- printf("numdirtypages counter wrong: %ld != %ld\n",
- numdirtypages, ptotals[BQ_DIRTY]);
- else
- printf("numdirtypages: %ld\n", numdirtypages);
-
- printf("syncer eating up to %ld pages from %ld reserved\n",
- maxcleanpages - hicleanpages, locleanpages);
- splx(s);
-}
-#endif /* DEBUG */
diff --git a/sys/kern/vfs_biomem.c b/sys/kern/vfs_biomem.c
new file mode 100644
index 00000000000..ccda55290fb
--- /dev/null
+++ b/sys/kern/vfs_biomem.c
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2007 Artur Grabowski <art@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/pool.h>
+#include <sys/mount.h>
+
+#include <uvm/uvm_extern.h>
+#include <uvm/uvm.h>
+
+vaddr_t buf_kva_start, buf_kva_end;
+int buf_needva;
+TAILQ_HEAD(,buf) buf_valist;
+
+int buf_nkvmsleep;
+
+extern struct bcachestats bcstats;
+
+/*
+ * Pages are allocated from a uvm object (we only use it for page storage,
+ * all pages are wired). Since every buffer contains a contiguous range of
+ * pages, reusing the pages could be very painful. Fortunately voff_t is
+ * 64 bits, so we can just increment buf_page_offset all the time and ignore
+ * wraparound. Even if you reuse 4GB worth of buffers every second
+ * you'll still run out of time_t faster than buffers.
+ *
+ * XXX - the spl locking in here is extreme paranoia right now until I figure
+ * it all out.
+ */
+voff_t buf_page_offset;
+struct uvm_object *buf_object, buf_object_store;
+
+vaddr_t buf_unmap(struct buf *);
+
+void
+buf_mem_init(vsize_t size)
+{
+ TAILQ_INIT(&buf_valist);
+
+ buf_kva_start = vm_map_min(kernel_map);
+ if (uvm_map(kernel_map, &buf_kva_start, size, NULL,
+ UVM_UNKNOWN_OFFSET, PAGE_SIZE, UVM_MAPFLAG(UVM_PROT_NONE,
+ UVM_PROT_NONE, UVM_INH_NONE, UVM_ADV_NORMAL, 0)))
+ panic("bufinit: can't reserve VM for buffers");
+ buf_kva_end = buf_kva_start + size;
+
+ buf_object = &buf_object_store;
+
+ buf_object->pgops = NULL;
+ TAILQ_INIT(&buf_object->memq);
+ buf_object->uo_npages = 0;
+ buf_object->uo_refs = 1;
+}
+
+/*
+ * buf_acquire and buf_release manage the kvm mappings of buffers.
+ */
+void
+buf_acquire(struct buf *bp)
+{
+ vaddr_t va;
+ int s;
+
+ KASSERT((bp->b_flags & B_BUSY) == 0);
+
+ s = splbio();
+ /*
+ * Busy before waiting for kvm.
+ */
+ SET(bp->b_flags, B_BUSY);
+
+ if (bp->b_data == NULL) {
+ unsigned long i;
+
+ /*
+ * First, just use the pre-allocated space until we run out.
+ */
+ if (buf_kva_start < buf_kva_end) {
+ va = buf_kva_start;
+ buf_kva_start += MAXPHYS;
+ } else {
+ struct buf *vbp;
+
+ /*
+ * Find some buffer we can steal the space from.
+ */
+ while ((vbp = TAILQ_FIRST(&buf_valist)) == NULL) {
+ buf_needva++;
+ buf_nkvmsleep++;
+ tsleep(&buf_needva, PRIBIO, "buf_needva", 0);
+ }
+ va = buf_unmap(vbp);
+ }
+
+ for (i = 0; i < atop(bp->b_bufsize); i++) {
+ struct vm_page *pg = uvm_pagelookup(bp->b_pobj,
+ bp->b_poffs + ptoa(i));
+
+ KASSERT(pg != NULL);
+
+ pmap_kenter_pa(va + ptoa(i), VM_PAGE_TO_PHYS(pg),
+ VM_PROT_READ|VM_PROT_WRITE);
+ pmap_update(pmap_kernel());
+ }
+ bp->b_data = (caddr_t)va;
+ } else {
+ TAILQ_REMOVE(&buf_valist, bp, b_valist);
+ }
+ splx(s);
+}
+
+/*
+ * Busy a buffer, but don't map it.
+ * If it has a mapping, we keep it, but we also keep the mapping on
+ * the list since we assume that it won't be used anymore.
+ */
+void
+buf_acquire_unmapped(struct buf *bp)
+{
+ int s;
+
+ s = splbio();
+ SET(bp->b_flags, B_BUSY|B_NOTMAPPED);
+ splx(s);
+}
+
+void
+buf_release(struct buf *bp)
+{
+ int s;
+
+ KASSERT(bp->b_flags & B_BUSY);
+ KASSERT((bp->b_data != NULL) || (bp->b_flags & B_NOTMAPPED));
+
+ s = splbio();
+ if (bp->b_data) {
+ TAILQ_INSERT_TAIL(&buf_valist, bp, b_valist);
+ if (buf_needva) {
+ buf_needva--;
+ wakeup_one(&buf_needva);
+ }
+ }
+ CLR(bp->b_flags, B_BUSY|B_NOTMAPPED);
+ splx(s);
+}
+
+/*
+ * Deallocate all memory resources for this buffer. We need to be careful
+ * to not drop kvm since we have no way to reclaim it. So, if the buffer
+ * has kvm, we need to free it later. We put it on the front of the
+ * freelist just so it gets picked up faster.
+ *
+ * Also, lots of assertions count on bp->b_data being NULL, so we
+ * set it temporarily to NULL.
+ *
+ * Return non-zero if we take care of the freeing later.
+ */
+int
+buf_dealloc_mem(struct buf *bp)
+{
+ caddr_t data = bp->b_data;
+ int s;
+
+ s = splbio();
+
+ bp->b_data = NULL;
+
+ if (bp->b_pobj) {
+ if (data) {
+ pmap_kremove((vaddr_t)data, bp->b_bufsize);
+ pmap_update(pmap_kernel());
+ }
+ buf_free_pages(bp);
+ }
+
+ if (data == NULL) {
+ splx(s);
+ return (0);
+ }
+
+ bp->b_data = data;
+ if (!(bp->b_flags & B_BUSY)) /* XXX - need better test */
+ TAILQ_REMOVE(&buf_valist, bp, b_valist);
+ else
+ CLR(bp->b_flags, B_BUSY);
+ SET(bp->b_flags, B_RELEASED);
+ TAILQ_INSERT_HEAD(&buf_valist, bp, b_valist);
+
+ splx(s);
+
+ return (1);
+}
+
+vaddr_t
+buf_unmap(struct buf *bp)
+{
+ vaddr_t va;
+ int s;
+
+ KASSERT((bp->b_flags & B_BUSY) == 0);
+ KASSERT(bp->b_data != NULL);
+
+ s = splbio();
+ TAILQ_REMOVE(&buf_valist, bp, b_valist);
+ va = (vaddr_t)bp->b_data;
+ bp->b_data = 0;
+ pmap_kremove(va, bp->b_bufsize);
+ pmap_update(pmap_kernel());
+
+ if (bp->b_flags & B_RELEASED)
+ pool_put(&bufpool, bp);
+
+ splx(s);
+
+ return (va);
+}
+
+void
+buf_alloc_pages(struct buf *bp, vsize_t size)
+{
+ struct vm_page *pg;
+ voff_t offs, i;
+ int s;
+
+ KASSERT(size == round_page(size));
+ KASSERT(bp->b_pobj == NULL);
+ KASSERT(bp->b_data == NULL);
+
+ s = splbio();
+
+ offs = buf_page_offset;
+ buf_page_offset += size;
+
+ KASSERT(buf_page_offset > 0);
+
+ for (i = 0; i < atop(size); i++) {
+#if defined(DEBUG) || 1
+ if ((pg = uvm_pagelookup(buf_object, offs + ptoa(i))))
+ panic("buf_alloc_pages: overlap buf: %p page: %p",
+ bp, pg);
+#endif
+
+ while ((pg = uvm_pagealloc(buf_object, offs + ptoa(i),
+ NULL, 0)) == NULL) {
+ uvm_wait("buf_alloc_pages");
+ }
+ pg->wire_count = 1;
+ atomic_clearbits_int(&pg->pg_flags, PG_BUSY);
+ bcstats.numbufpages++;
+ }
+
+ bp->b_pobj = buf_object;
+ bp->b_poffs = offs;
+ bp->b_bufsize = size;
+ splx(s);
+}
+
+void
+buf_free_pages(struct buf *bp)
+{
+ struct uvm_object *uobj = bp->b_pobj;
+ struct vm_page *pg;
+ voff_t off, i;
+ int s;
+
+ KASSERT(bp->b_data == NULL);
+ KASSERT(uobj != NULL);
+
+ s = splbio();
+
+ off = bp->b_poffs;
+ bp->b_pobj = NULL;
+ bp->b_poffs = 0;
+
+ for (i = 0; i < (bp->b_bufsize >> PAGE_SHIFT); i++) {
+ pg = uvm_pagelookup(uobj, off + (i * PAGE_SIZE));
+ KASSERT(pg != NULL);
+ KASSERT(pg->wire_count == 1);
+ pg->wire_count = 0;
+ uvm_pagefree(pg);
+ bcstats.numbufpages--;
+ }
+ splx(s);
+}
+
+/*
+ * XXX - it might make sense to make a buf_realloc_pages to avoid
+ * bouncing through the free list all the time.
+ */
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 3ac320e5bc1..65153b0cdfc 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: vfs_subr.c,v 1.167 2008/06/09 23:38:37 millert Exp $ */
+/* $OpenBSD: vfs_subr.c,v 1.168 2008/06/10 20:14:36 beck Exp $ */
/* $NetBSD: vfs_subr.c,v 1.53 1996/04/22 01:39:13 christos Exp $ */
/*
@@ -120,7 +120,7 @@ void
vntblinit(void)
{
/* buffer cache may need a vnode for each buffer */
- maxvnodes = desiredvnodes;
+ maxvnodes = bufpages;
pool_init(&vnode_pool, sizeof(struct vnode), 0, 0, 0, "vnodes",
&pool_allocator_nointr);
TAILQ_INIT(&vnode_hold_list);
@@ -1256,8 +1256,12 @@ vfs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
free(tmpvfsp, M_TEMP);
return (ret);
- }
+ case VFS_BCACHESTAT: /* buffer cache statistics */
+ ret = sysctl_rdstruct(oldp, oldlenp, newp, &bcstats,
+ sizeof(struct bcachestats));
+ return(ret);
+ }
return (EOPNOTSUPP);
}
@@ -1664,7 +1668,7 @@ vfs_syncwait(int verbose)
if (bp->b_flags & B_DELWRI) {
s = splbio();
bremfree(bp);
- bp->b_flags |= B_BUSY;
+ buf_acquire(bp);
splx(s);
nbusy++;
bawrite(bp);
@@ -1835,7 +1839,7 @@ loop:
break;
}
bremfree(bp);
- bp->b_flags |= B_BUSY;
+ buf_acquire(bp);
/*
* XXX Since there are no node locks for NFS, I believe
* there is a slight chance that a delayed write will
@@ -1873,7 +1877,7 @@ loop:
if ((bp->b_flags & B_DELWRI) == 0)
panic("vflushbuf: not dirty");
bremfree(bp);
- bp->b_flags |= B_BUSY;
+ buf_acquire(bp);
splx(s);
/*
* Wait for I/O associated with indirect blocks to complete,
diff --git a/sys/miscfs/specfs/spec_vnops.c b/sys/miscfs/specfs/spec_vnops.c
index e175105abc1..3af4f6c1a10 100644
--- a/sys/miscfs/specfs/spec_vnops.c
+++ b/sys/miscfs/specfs/spec_vnops.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: spec_vnops.c,v 1.49 2008/05/08 17:45:45 thib Exp $ */
+/* $OpenBSD: spec_vnops.c,v 1.50 2008/06/10 20:14:36 beck Exp $ */
/* $NetBSD: spec_vnops.c,v 1.29 1996/04/22 01:42:38 christos Exp $ */
/*
@@ -447,7 +447,7 @@ loop:
if ((bp->b_flags & B_DELWRI) == 0)
panic("spec_fsync: not dirty");
bremfree(bp);
- bp->b_flags |= B_BUSY;
+ buf_acquire(bp);
splx(s);
bawrite(bp);
goto loop;
diff --git a/sys/nfs/nfs_syscalls.c b/sys/nfs/nfs_syscalls.c
index 5b29ccd0935..007933590d7 100644
--- a/sys/nfs/nfs_syscalls.c
+++ b/sys/nfs/nfs_syscalls.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: nfs_syscalls.c,v 1.59 2008/05/02 13:26:27 thib Exp $ */
+/* $OpenBSD: nfs_syscalls.c,v 1.60 2008/06/10 20:14:37 beck Exp $ */
/* $NetBSD: nfs_syscalls.c,v 1.19 1996/02/18 11:53:52 fvdl Exp $ */
/*
@@ -748,7 +748,8 @@ nfssvc_iod(p)
(B_BUSY|B_DELWRI|B_NEEDCOMMIT|B_NOCACHE))!=B_DELWRI)
continue;
bremfree(nbp);
- nbp->b_flags |= (B_BUSY|B_ASYNC);
+ nbp->b_flags |= B_ASYNC;
+ buf_acquire(nbp);
break;
}
/*
diff --git a/sys/nfs/nfs_vnops.c b/sys/nfs/nfs_vnops.c
index be30de9f5f4..d26824f8b0e 100644
--- a/sys/nfs/nfs_vnops.c
+++ b/sys/nfs/nfs_vnops.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: nfs_vnops.c,v 1.84 2008/06/09 23:38:37 millert Exp $ */
+/* $OpenBSD: nfs_vnops.c,v 1.85 2008/06/10 20:14:37 beck Exp $ */
/* $NetBSD: nfs_vnops.c,v 1.62.4.1 1996/07/08 20:26:52 jtc Exp $ */
/*
@@ -2675,7 +2675,8 @@ again:
!= (B_DELWRI | B_NEEDCOMMIT))
continue;
bremfree(bp);
- bp->b_flags |= (B_BUSY | B_WRITEINPROG);
+ bp->b_flags |= B_WRITEINPROG;
+ buf_acquire(bp);
/*
* A list of these buffers is kept so that the
* second loop knows which buffers have actually
@@ -2753,10 +2754,12 @@ loop:
if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT))
continue;
bremfree(bp);
- if (passone || !commit)
- bp->b_flags |= (B_BUSY|B_ASYNC);
- else
- bp->b_flags |= (B_BUSY|B_ASYNC|B_WRITEINPROG|B_NEEDCOMMIT);
+ if (passone || !commit) {
+ bp->b_flags |= B_ASYNC;
+ } else {
+ bp->b_flags |= (B_ASYNC|B_WRITEINPROG|B_NEEDCOMMIT);
+ }
+ buf_acquire(bp);
splx(s);
VOP_BWRITE(bp);
goto loop;
@@ -2952,6 +2955,8 @@ nfs_writebp(bp, force)
if (retv) {
if (force)
bp->b_flags |= B_WRITEINPROG;
+ bcstats.pendingwrites++;
+ bcstats.numwrites++;
VOP_STRATEGY(bp);
}
diff --git a/sys/scsi/cd.c b/sys/scsi/cd.c
index 52cde028c4c..bbc1bbb25ba 100644
--- a/sys/scsi/cd.c
+++ b/sys/scsi/cd.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: cd.c,v 1.135 2008/05/27 11:39:22 fgsch Exp $ */
+/* $OpenBSD: cd.c,v 1.136 2008/06/10 20:14:37 beck Exp $ */
/* $NetBSD: cd.c,v 1.100 1997/04/02 02:29:30 mycroft Exp $ */
/*
@@ -82,6 +82,8 @@
#define CD_FRAMES 75
#define CD_SECS 60
+#define CD_LOCKED 0x0800
+
struct cd_toc {
struct ioc_toc_header header;
struct cd_toc_entry entries[MAXTRACK+1]; /* One extra for the */
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index c301148a9b2..50edefe61f4 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: buf.h,v 1.57 2007/05/28 18:08:47 pedro Exp $ */
+/* $OpenBSD: buf.h,v 1.58 2008/06/10 20:14:37 beck Exp $ */
/* $NetBSD: buf.h,v 1.25 1997/04/09 21:12:17 mycroft Exp $ */
/*
@@ -87,6 +87,12 @@ struct buf {
dev_t b_dev; /* Device associated with buffer. */
caddr_t b_data; /* associated data */
void *b_saveaddr; /* Original b_data for physio. */
+
+ TAILQ_ENTRY(buf) b_valist; /* LRU of va to reuse. */
+
+ struct uvm_object *b_pobj; /* Object containing the pages */
+ off_t b_poffs; /* Offset within object */
+
daddr64_t b_lblkno; /* Logical block number. */
daddr64_t b_blkno; /* Underlying physical block number. */
/* Function to call upon completion.
@@ -161,6 +167,8 @@ struct buf *bufq_default_get(struct bufq *);
#define B_DEFERRED 0x04000000 /* Skipped over for cleaning */
#define B_SCANNED 0x08000000 /* Block already pushed during sync */
#define B_PDAEMON 0x10000000 /* I/O started by pagedaemon */
+#define B_RELEASED 0x20000000 /* free this buffer after its kvm */
+#define B_NOTMAPPED 0x40000000 /* BUSY, but not necessarily mapped */
#define B_BITS "\010\001AGE\002NEEDCOMMIT\003ASYNC\004BAD\005BUSY\006CACHE" \
"\007CALL\010DELWRI\012DONE\013EINTR\014ERROR" \
@@ -228,6 +236,23 @@ struct buf *getblk(struct vnode *, daddr64_t, int, int, int);
struct buf *geteblk(int);
struct buf *incore(struct vnode *, daddr64_t);
+/*
+ * buf_kvm_init initializes the kvm handling for buffers.
+ * buf_acquire sets the B_BUSY flag and ensures that the buffer is
+ * mapped in the kvm.
+ * buf_release clears the B_BUSY flag and allows the buffer to become
+ * unmapped.
+ * buf_unmap is for internal use only. Unmaps the buffer from kvm.
+ */
+void buf_mem_init(vsize_t);
+void buf_acquire(struct buf *);
+void buf_acquire_unmapped(struct buf *);
+void buf_release(struct buf *);
+int buf_dealloc_mem(struct buf *);
+void buf_alloc_pages(struct buf *, vsize_t);
+void buf_free_pages(struct buf *);
+
+
void minphys(struct buf *bp);
int physio(void (*strategy)(struct buf *), struct buf *bp, dev_t dev,
int flags, void (*minphys)(struct buf *), struct uio *uio);
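For reference, the conversion pattern the callers in this commit follow (see the spec_vnops.c and vfs_subr.c hunks above); an illustrative in-kernel fragment under the new API, not a standalone program:

	int s;

	s = splbio();
	bremfree(bp);		/* take the buffer off its free list */
	buf_acquire(bp);	/* set B_BUSY, map the buffer into kvm
				   if it is currently unmapped */
	splx(s);
	bawrite(bp);		/* start the async write; the buffer is
				   buf_release()d via brelse() later */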
diff --git a/sys/sys/mount.h b/sys/sys/mount.h
index 09a6d67ee2f..64e01badfef 100644
--- a/sys/sys/mount.h
+++ b/sys/sys/mount.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: mount.h,v 1.84 2008/05/07 14:09:36 thib Exp $ */
+/* $OpenBSD: mount.h,v 1.85 2008/06/10 20:14:37 beck Exp $ */
/* $NetBSD: mount.h,v 1.48 1996/02/18 11:55:47 fvdl Exp $ */
/*
@@ -465,10 +465,13 @@ struct mount {
#define VFS_MAXTYPENUM 1 /* int: highest defined filesystem type */
#define VFS_CONF 2 /* struct: vfsconf for filesystem given
as next argument */
+#define VFS_BCACHESTAT 3 /* struct: buffer cache statistics given
+ as next argument */
#define CTL_VFSGENCTL_NAMES { \
{ 0, 0 }, \
{ "maxtypenum", CTLTYPE_INT }, \
- { "conf", CTLTYPE_NODE } \
+ { "conf", CTLTYPE_NODE }, \
+ { "bcachestat", CTLTYPE_STRUCT } \
}
/*
@@ -485,6 +488,24 @@ struct vfsconf {
struct vfsconf *vfc_next; /* next in list */
};
+/* buffer cache statistics */
+struct bcachestats {
+ long numbufs; /* number of buffers allocated */
+ long freebufs; /* number of free buffers */
+ long numbufpages; /* number of pages in buffer cache */
+ long numfreepages; /* number of free pages */
+ long numdirtypages; /* number of dirty free pages */
+ long numcleanpages; /* number of clean free pages */
+ long pendingwrites; /* number of pending writes */
+ long pendingreads; /* number of pending reads */
+ long numwrites; /* total writes started */
+ long numreads; /* total reads started */
+ long cachehits; /* total reads found in cache */
+};
+#ifdef _KERNEL
+extern struct bcachestats bcstats;
+#endif
+
/*
* Operations supported on mounted file system.
*/
diff --git a/sys/ufs/ext2fs/ext2fs_bmap.c b/sys/ufs/ext2fs/ext2fs_bmap.c
index dc7acb7d213..5de33f44a8f 100644
--- a/sys/ufs/ext2fs/ext2fs_bmap.c
+++ b/sys/ufs/ext2fs/ext2fs_bmap.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ext2fs_bmap.c,v 1.15 2007/06/17 20:15:25 jasper Exp $ */
+/* $OpenBSD: ext2fs_bmap.c,v 1.16 2008/06/10 20:14:37 beck Exp $ */
/* $NetBSD: ext2fs_bmap.c,v 1.5 2000/03/30 12:41:11 augustss Exp $ */
/*
@@ -190,6 +190,7 @@ ext2fs_bmaparray(struct vnode *vp, int32_t bn, daddr64_t *bnp,
bp->b_flags |= B_READ;
VOP_STRATEGY(bp);
curproc->p_stats->p_ru.ru_inblock++; /* XXX */
+ bcstats.pendingreads++;
if ((error = biowait(bp)) != 0) {
brelse(bp);
return (error);
diff --git a/sys/ufs/ext2fs/ext2fs_inode.c b/sys/ufs/ext2fs/ext2fs_inode.c
index a78632ce518..a974b9f0874 100644
--- a/sys/ufs/ext2fs/ext2fs_inode.c
+++ b/sys/ufs/ext2fs/ext2fs_inode.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ext2fs_inode.c,v 1.39 2007/10/29 17:06:20 chl Exp $ */
+/* $OpenBSD: ext2fs_inode.c,v 1.40 2008/06/10 20:14:37 beck Exp $ */
/* $NetBSD: ext2fs_inode.c,v 1.24 2001/06/19 12:59:18 wiz Exp $ */
/*
@@ -466,6 +466,8 @@ ext2fs_indirtrunc(struct inode *ip, int32_t lbn, int32_t dbn, int32_t lastbn, in
bp = getblk(vp, lbn, (int)fs->e2fs_bsize, 0, 0);
if (!(bp->b_flags & (B_DONE | B_DELWRI))) {
curproc->p_stats->p_ru.ru_inblock++; /* pay for read */
+ bcstats.pendingreads++;
+ bcstats.numreads++;
bp->b_flags |= B_READ;
if (bp->b_bcount > bp->b_bufsize)
panic("ext2fs_indirtrunc: bad buffer size");
diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c
index 6105983adbf..4021a9da0fc 100644
--- a/sys/ufs/ffs/ffs_inode.c
+++ b/sys/ufs/ffs/ffs_inode.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ffs_inode.c,v 1.52 2008/01/05 19:49:26 otto Exp $ */
+/* $OpenBSD: ffs_inode.c,v 1.53 2008/06/10 20:14:37 beck Exp $ */
/* $NetBSD: ffs_inode.c,v 1.10 1996/05/11 18:27:19 mycroft Exp $ */
/*
@@ -501,6 +501,8 @@ ffs_indirtrunc(struct inode *ip, daddr64_t lbn, daddr64_t dbn,
bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0);
if (!(bp->b_flags & (B_DONE | B_DELWRI))) {
curproc->p_stats->p_ru.ru_inblock++; /* pay for read */
+ bcstats.pendingreads++;
+ bcstats.numreads++;
bp->b_flags |= B_READ;
if (bp->b_bcount > bp->b_bufsize)
panic("ffs_indirtrunc: bad buffer size");
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index e2f88051a8a..8b74fef005e 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ffs_softdep.c,v 1.94 2008/01/05 19:49:26 otto Exp $ */
+/* $OpenBSD: ffs_softdep.c,v 1.95 2008/06/10 20:14:37 beck Exp $ */
/*
* Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved.
@@ -4691,9 +4691,9 @@ softdep_fsync_mountdev(vp, waitfor)
/*
* If it is already scheduled, skip to the next buffer.
*/
+ splassert(IPL_BIO);
if (bp->b_flags & B_BUSY)
continue;
- bp->b_flags |= B_BUSY;
if ((bp->b_flags & B_DELWRI) == 0) {
FREE_LOCK(&lk);
@@ -4705,10 +4705,10 @@ softdep_fsync_mountdev(vp, waitfor)
*/
if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
wk->wk_type != D_BMSAFEMAP) {
- bp->b_flags &= ~B_BUSY;
continue;
}
bremfree(bp);
+ buf_acquire(bp);
FREE_LOCK(&lk);
(void) bawrite(bp);
ACQUIRE_LOCK(&lk);
@@ -5616,7 +5616,7 @@ getdirtybuf(bp, waitfor)
if ((bp->b_flags & B_DELWRI) == 0)
return (0);
bremfree(bp);
- bp->b_flags |= B_BUSY;
+ buf_acquire(bp);
return (1);
}
diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c
index 9e08f8b475f..489a4880473 100644
--- a/sys/ufs/ffs/ffs_vnops.c
+++ b/sys/ufs/ffs/ffs_vnops.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ffs_vnops.c,v 1.48 2008/05/08 17:45:45 thib Exp $ */
+/* $OpenBSD: ffs_vnops.c,v 1.49 2008/06/10 20:14:37 beck Exp $ */
/* $NetBSD: ffs_vnops.c,v 1.7 1996/05/11 18:27:24 mycroft Exp $ */
/*
@@ -449,7 +449,8 @@ loop:
}
bremfree(bp);
- bp->b_flags |= B_BUSY | B_SCANNED;
+ buf_acquire(bp);
+ bp->b_flags |= B_SCANNED;
splx(s);
/*
* On our final pass through, do all I/O synchronously
diff --git a/sys/ufs/ufs/ufs_bmap.c b/sys/ufs/ufs/ufs_bmap.c
index fa22c0934ec..75205c56bc3 100644
--- a/sys/ufs/ufs/ufs_bmap.c
+++ b/sys/ufs/ufs/ufs_bmap.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ufs_bmap.c,v 1.26 2008/01/05 19:49:26 otto Exp $ */
+/* $OpenBSD: ufs_bmap.c,v 1.27 2008/06/10 20:14:37 beck Exp $ */
/* $NetBSD: ufs_bmap.c,v 1.3 1996/02/09 22:36:00 christos Exp $ */
/*
@@ -173,6 +173,8 @@ ufs_bmaparray(struct vnode *vp, daddr64_t bn, daddr64_t *bnp, struct indir *ap,
else {
bp->b_blkno = blkptrtodb(ump, daddr);
bp->b_flags |= B_READ;
+ bcstats.pendingreads++;
+ bcstats.numreads++;
VOP_STRATEGY(bp);
curproc->p_stats->p_ru.ru_inblock++; /* XXX */
if ((error = biowait(bp)) != 0) {
diff --git a/sys/ufs/ufs/ufs_dirhash.c b/sys/ufs/ufs/ufs_dirhash.c
index d6ebabe70fe..5391c977caa 100644
--- a/sys/ufs/ufs/ufs_dirhash.c
+++ b/sys/ufs/ufs/ufs_dirhash.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ufs_dirhash.c,v 1.17 2007/10/08 19:26:48 krw Exp $ */
+/* $OpenBSD: ufs_dirhash.c,v 1.18 2008/06/10 20:14:37 beck Exp $ */
/*
* Copyright (c) 2001, 2002 Ian Dowse. All rights reserved.
*
@@ -211,7 +211,6 @@ ufsdirhash_build(struct inode *ip)
if (UFS_BUFATOFF(ip, (off_t)pos, NULL, &bp) != 0)
goto fail;
}
-
/* Add this entry to the hash. */
ep = (struct direct *)((char *)bp->b_data + (pos & bmask));
if (ep->d_reclen == 0 || ep->d_reclen >