Diffstat (limited to 'sys/kern/vfs_bio.c')
-rw-r--r--	sys/kern/vfs_bio.c	207
1 file changed, 40 insertions, 167 deletions
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 42435d9e05f..0e7c21786ec 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -1,7 +1,7 @@
-/*	$OpenBSD: vfs_bio.c,v 1.116 2009/06/06 18:06:22 art Exp $	*/
+/*	$OpenBSD: vfs_bio.c,v 1.117 2009/06/15 17:01:26 beck Exp $	*/
 /*	$NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $	*/
 
-/*
+/*-
  * Copyright (c) 1994 Christopher G. Demetriou
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -62,6 +62,20 @@
 #include <miscfs/specfs/specdev.h>
 
 /*
+ * Definitions for the buffer hash lists.
+ */
+#define	BUFHASH(dvp, lbn)	\
+	(&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
+LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
+u_long	bufhash;
+
+/*
+ * Insq/Remq for the buffer hash lists.
+ */
+#define	binshash(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_hash)
+#define	bremhash(bp)		LIST_REMOVE(bp, b_hash)
+
+/*
  * Definitions for the buffer free lists.
  */
 #define	BQUEUES		2		/* number of free buffer queues */
@@ -109,9 +123,6 @@ long hidirtypages;
 long locleanpages;
 long hicleanpages;
 long maxcleanpages;
-long backoffpages;	/* backoff counter for page allocations */
-long buflowpages;	/* bufpages low water mark */
-long bufhighpages;	/* bufpages high water mark */
 
 /* XXX - should be defined here. */
 extern int bufcachepercent;
@@ -171,13 +182,9 @@ buf_put(struct buf *bp)
 		panic("buf_put: b_dep is not empty");
 #endif
 
+	bremhash(bp);
 	LIST_REMOVE(bp, b_list);
 	bcstats.numbufs--;
-	if (backoffpages) {
-		backoffpages -= atop(bp->b_bufsize);
-		if (backoffpages < 0)
-			backoffpages = 0;
-	}
 
 	if (buf_dealloc_mem(bp) != 0)
 		return;
@@ -193,7 +200,7 @@ bufinit(void)
 	struct bqueues *dp;
 
 	/* XXX - for now */
-	bufhighpages = buflowpages = bufpages = bufcachepercent = bufkvm = 0;
+	bufpages = bufcachepercent = bufkvm = 0;
 
 	/*
 	 * If MD code doesn't say otherwise, use 10% of kvm for mappings and
@@ -204,16 +211,6 @@ bufinit(void)
 	if (bufpages == 0)
 		bufpages = physmem * bufcachepercent / 100;
 
-	bufhighpages = bufpages;
-
-	/*
-	 * set the base backoff level for the buffer cache to bufpages.
-	 * we will not allow uvm to steal back more than this number of
-	 * pages
-	 */
-	buflowpages = physmem * 10 / 100;
-
-
 	if (bufkvm == 0)
 		bufkvm = (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 10;
@@ -240,6 +237,7 @@ bufinit(void)
 	 */
 	buf_mem_init(bufkvm);
 
+	bufhashtbl = hashinit(bufpages / 4, M_CACHE, M_WAITOK, &bufhash);
 	hidirtypages = (bufpages / 4) * 3;
 	lodirtypages = bufpages / 2;
 
@@ -253,104 +251,6 @@ bufinit(void)
 	maxcleanpages = locleanpages;
 }
 
-/*
- * Change cachepct
- */
-void
-bufadjust(int newbufpages)
-{
-	/*
-	 * XXX - note, bufkvm was allocated once, based on 10% of physmem
-	 * see above.
-	 */
-	struct buf *bp;
-	int s;
-
-	s = splbio();
-	bufpages = newbufpages;
-
-	hidirtypages = (bufpages / 4) * 3;
-	lodirtypages = bufpages / 2;
-
-	/*
-	 * When we hit 95% of pages being clean, we bring them down to
-	 * 90% to have some slack.
-	 */
-	hicleanpages = bufpages - (bufpages / 20);
-	locleanpages = bufpages - (bufpages / 10);
-
-	maxcleanpages = locleanpages;
-
-	/*
-	 * If we we have more buffers allocated than bufpages,
-	 * free them up to get back down. this may possibly consume
-	 * all our clean pages...
-	 */
-	while ((bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN])) &&
-	    (bcstats.numbufpages > bufpages)) {
-		bremfree(bp);
-		if (bp->b_vp) {
-			RB_REMOVE(buf_rb_bufs,
-			    &bp->b_vp->v_bufs_tree, bp);
-			brelvp(bp);
-		}
-		buf_put(bp);
-	}
-
-	/*
-	 * Wake up cleaner if we're getting low on pages. We might
-	 * now have too much dirty, or have fallen below our low
-	 * water mark on clean pages so we need to free more stuff
-	 * up.
-	 */
-	if (bcstats.numdirtypages >= hidirtypages ||
-	    bcstats.numcleanpages <= locleanpages)
-		wakeup(&bd_req);
-
-	/*
-	 * if immediate action has not freed up enough goo for us
-	 * to proceed - we tsleep and wait for the cleaner above
-	 * to do it's work and get us reduced down to sanity.
-	 */
-	while (bcstats.numbufpages > bufpages) {
-		tsleep(&needbuffer, PRIBIO, "needbuffer", 0);
-	}
-	splx(s);
-}
-
-/*
- * Make the buffer cache back off from cachepct.
- */
-int
-bufbackoff()
-{
-	/*
-	 * Back off the amount of buffer cache pages. Called by the page
-	 * daemon to consume buffer cache pages rather than swapping.
-	 *
-	 * On success, it frees N pages from the buffer cache, and sets
-	 * a flag so that the next N allocations from buf_get will recycle
-	 * a buffer rather than allocate a new one. It then returns 0 to the
-	 * caller.
-	 *
-	 * on failure, it could free no pages from the buffer cache, does
-	 * nothing and returns -1 to the caller.
-	 */
-	long d;
-
-	if (bufpages <= buflowpages)
-		return(-1);
-
-	if (bufpages - BACKPAGES >= buflowpages)
-		d = BACKPAGES;
-	else
-		d = bufpages - buflowpages;
-	backoffpages = BACKPAGES;
-	bufadjust(bufpages - d);
-	backoffpages = BACKPAGES;
-	return(0);
-}
-
 struct buf *
 bio_doread(struct vnode *vp, daddr64_t blkno, int size, int async)
 {
@@ -776,12 +676,10 @@ brelse(struct buf *bp)
 			CLR(bp->b_flags, B_DELWRI);
 		}
 
-		if (bp->b_vp) {
-			RB_REMOVE(buf_rb_bufs, &bp->b_vp->v_bufs_tree,
-			    bp);
+		if (bp->b_vp)
 			brelvp(bp);
-		}
-		bp->b_vp = NULL;
+		bremhash(bp);
+		binshash(bp, &invalhash);
 
 		/*
 		 * If the buffer has no associated data, place it back in the
@@ -799,9 +697,6 @@ brelse(struct buf *bp)
 			CLR(bp->b_flags, B_WANTED);
 			wakeup(bp);
 		}
-		if (bp->b_vp != NULL)
-			RB_REMOVE(buf_rb_bufs,
-			    &bp->b_vp->v_bufs_tree, bp);
 		buf_put(bp);
 		splx(s);
 		return;
@@ -863,14 +758,15 @@ struct buf *
 incore(struct vnode *vp, daddr64_t blkno)
 {
 	struct buf *bp;
-	struct buf b;
-
-	/* Search buf lookup tree */
-	b.b_lblkno = blkno;
-	bp = RB_FIND(buf_rb_bufs, &vp->v_bufs_tree, &b);
-	if (bp && !ISSET(bp->b_flags, B_INVAL))
-		return(bp);
-	return(NULL);
+
+	/* Search hash chain */
+	LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
+		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
+		    !ISSET(bp->b_flags, B_INVAL))
+			return (bp);
+	}
+
+	return (NULL);
 }
 
 /*
@@ -885,7 +781,6 @@ struct buf *
 getblk(struct vnode *vp, daddr64_t blkno, int size, int slpflag, int slptimeo)
 {
 	struct buf *bp;
-	struct buf b;
 	int s, error;
 
 	/*
@@ -899,9 +794,9 @@ getblk(struct vnode *vp, daddr64_t blkno, int size, int slpflag, int slptimeo)
 	 * the block until the write is finished.
 	 */
 start:
-	b.b_lblkno = blkno;
-	bp = RB_FIND(buf_rb_bufs, &vp->v_bufs_tree, &b);
-	if (bp != NULL) {
+	LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
+		if (bp->b_lblkno != blkno || bp->b_vp != vp)
+			continue;
 		s = splbio();
 
 		if (ISSET(bp->b_flags, B_BUSY)) {
@@ -950,24 +845,11 @@ geteblk(int size)
 struct buf *
 buf_get(struct vnode *vp, daddr64_t blkno, size_t size)
 {
-	static int gcount = 0;
 	struct buf *bp;
 	int poolwait = size == 0 ? PR_NOWAIT : PR_WAITOK;
 	int npages;
 	int s;
 
-	/*
-	 * if we were previously backed off, slowly climb back up
-	 * to the high water mark again.
-	 */
-	if ((backoffpages == 0) && (bufpages < bufhighpages)) {
-		if ( gcount == 0 )  {
-			bufadjust(bufpages + BACKPAGES);
-			gcount += BACKPAGES;
-		} else
-			gcount--;
-	}
-
 	s = splbio();
 	if (size) {
 		/*
@@ -985,11 +867,8 @@ buf_get(struct vnode *vp, daddr64_t blkno, size_t size)
 		while (bcstats.numcleanpages > locleanpages) {
 			bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN]);
 			bremfree(bp);
-			if (bp->b_vp) {
-				RB_REMOVE(buf_rb_bufs,
-				    &bp->b_vp->v_bufs_tree, bp);
+			if (bp->b_vp)
 				brelvp(bp);
-			}
 			buf_put(bp);
 		}
 	}
@@ -999,21 +878,16 @@ buf_get(struct vnode *vp, daddr64_t blkno, size_t size)
 		/*
 		 * Free some buffers until we have enough space.
 		 */
-		while ((bcstats.numbufpages + npages > bufpages)
-		    || backoffpages) {
+		while (bcstats.numbufpages + npages > bufpages) {
 			int freemax = 5;
 			int i = freemax;
 			while ((bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN])) && i--) {
 				bremfree(bp);
-				if (bp->b_vp) {
-					RB_REMOVE(buf_rb_bufs,
-					    &bp->b_vp->v_bufs_tree, bp);
+				if (bp->b_vp)
 					brelvp(bp);
-				}
 				buf_put(bp);
 			}
-			if (freemax == i &&
-			    (bcstats.numbufpages + npages > bufpages)) {
+			if (freemax == i) {
 				needbuffer++;
 				tsleep(&needbuffer, PRIBIO, "needbuffer", 0);
 				splx(s);
@@ -1054,12 +928,11 @@ buf_get(struct vnode *vp, daddr64_t blkno, size_t size)
 		bp->b_blkno = bp->b_lblkno = blkno;
 		bgetvp(vp, bp);
-		if (RB_INSERT(buf_rb_bufs, &vp->v_bufs_tree, bp))
-			panic("buf_get: dup lblk vp %p bp %p", vp, bp);
+		binshash(bp, BUFHASH(vp, blkno));
 	} else {
 		bp->b_vnbufs.le_next = NOLIST;
 		SET(bp->b_flags, B_INVAL);
-		bp->b_vp = NULL;
+		binshash(bp, &invalhash);
 	}
 
 	LIST_INSERT_HEAD(&bufhead, bp, b_list);
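
Note (illustration, not part of the commit): the change above drops the per-vnode RB tree (RB_FIND/RB_INSERT on v_bufs_tree) and returns to the global buffer hash, where each buffer is filed on a chain chosen by BUFHASH(vp, lbn) and incore()/getblk() walk that chain comparing b_vp and b_lblkno. The stand-alone C sketch below mirrors only that lookup pattern with simplified stand-in types: struct buf and struct vnode are reduced to the fields the lookup needs, and bufhashinit() is a hypothetical userland stand-in for the kernel's hashinit() call seen in bufinit(). It assumes <sys/queue.h> is available, as on BSD and glibc systems.

/*
 * Minimal userland sketch of the buffer hash-chain lookup restored by
 * the diff.  Types are simplified stand-ins, not the kernel's struct buf.
 */
#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct vnode { int v_dummy; };

struct buf {
	LIST_ENTRY(buf)	 b_hash;	/* hash chain linkage */
	struct vnode	*b_vp;		/* vnode the buffer belongs to */
	long long	 b_lblkno;	/* logical block number */
};

LIST_HEAD(bufhashhdr, buf) *bufhashtbl;
unsigned long bufhash;			/* table size minus one */

/* Same shape as the diff's hash: mix the vnode pointer with the block. */
#define	BUFHASH(dvp, lbn) \
	(&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])

/* Hypothetical stand-in for hashinit(9): power-of-two array of chains. */
static void
bufhashinit(unsigned long elements)
{
	unsigned long i, size;

	for (size = 1; size < elements; size <<= 1)
		continue;
	bufhashtbl = calloc(size, sizeof(*bufhashtbl));
	for (i = 0; i < size; i++)
		LIST_INIT(&bufhashtbl[i]);
	bufhash = size - 1;
}

/* Insert on the chain picked by BUFHASH, as binshash() does in the diff. */
static void
binshash(struct buf *bp)
{
	LIST_INSERT_HEAD(BUFHASH(bp->b_vp, bp->b_lblkno), bp, b_hash);
}

/* Walk the chain comparing vnode and block, as incore() does in the diff. */
static struct buf *
incore(struct vnode *vp, long long blkno)
{
	struct buf *bp;

	LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash)
		if (bp->b_lblkno == blkno && bp->b_vp == vp)
			return (bp);
	return (NULL);
}

int
main(void)
{
	struct vnode vn;
	struct buf b = { .b_vp = &vn, .b_lblkno = 42 };

	bufhashinit(64);
	binshash(&b);
	printf("found: %p\n", (void *)incore(&vn, 42));
	printf("miss:  %p\n", (void *)incore(&vn, 7));
	return 0;
}

Dividing the vnode pointer by sizeof(*(dvp)) roughly strips the low bits that are identical for every vnode because of allocation size, so adding the block number spreads a vnode's blocks across neighbouring chains; the & bufhash mask works because hashinit() sizes the table to a power of two and hands back size minus one as the mask.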