From 99719d09f990cf65f3f9922bab47d9a3e85b2974 Mon Sep 17 00:00:00 2001
From: Bob Beck <beck@cvs.openbsd.org>
Date: Fri, 5 Jun 2009 04:29:15 +0000
Subject: Dynamic buffer cache sizing.

This commit won't change the default behaviour of the system unless the
buffer cache size is increased with sysctl kern.bufcachepercent. By default
our buffer cache is 10% of memory, which with this commit is now treated
as a low water mark.  If the buffer cache size is increased, the new size
is treated as a high water mark and the buffer cache is permitted to grow
to that percentage of memory.

If the page daemon is invoked, the page daemon will ask the buffer cache
to relenquish pages. if the buffer cache has more than the low water mark it
will relenquish pages allowing them to be consumed by uvm. after a short
period the buffer cache will attempt to re-grow back to the high water mark.

This permits the use of a large buffer cache without penalizing the available
memory for other purposes.

Above the low water mark the buffer cache remains entirely subservient to
the page daemon, so if uvm requires pages, the buffer cache will abandon
them.

ok art@ thib@ oga@
---
 sys/kern/vfs_bio.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 109 insertions(+), 5 deletions(-)

(limited to 'sys/kern/vfs_bio.c')

diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index c106a1a1b36..8e900669145 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -1,7 +1,7 @@
-/*	$OpenBSD: vfs_bio.c,v 1.114 2009/06/03 21:30:20 beck Exp $	*/
+/*	$OpenBSD: vfs_bio.c,v 1.115 2009/06/05 04:29:14 beck Exp $	*/
 /*	$NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $	*/
 
-/*-
+/*
  * Copyright (c) 1994 Christopher G. Demetriou
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -109,6 +109,9 @@ long hidirtypages;
 long locleanpages;
 long hicleanpages;
 long maxcleanpages;
+long backoffpages;	/* backoff counter for page allocations */
+long buflowpages;	/* bufpages low water mark */
+long bufhighpages; 	/* bufpages high water mark */
 
 /* XXX - should be defined here. */
 extern int bufcachepercent;
@@ -170,6 +173,11 @@ buf_put(struct buf *bp)
 
 	LIST_REMOVE(bp, b_list);
 	bcstats.numbufs--;
+	if (backoffpages) {
+		backoffpages -= atop(bp->b_bufsize);
+		if (backoffpages < 0)
+			backoffpages = 0;
+	}
 
 	if (buf_dealloc_mem(bp) != 0)
 		return;
@@ -185,7 +193,7 @@ bufinit(void)
 	struct bqueues *dp;
 
 	/* XXX - for now */
-	bufpages = bufcachepercent = bufkvm = 0;
+	bufhighpages = buflowpages = bufpages = bufcachepercent = bufkvm = 0;
 
 	/*
 	 * If MD code doesn't say otherwise, use 10% of kvm for mappings and
@@ -196,6 +204,16 @@ bufinit(void)
 	if (bufpages == 0)
 		bufpages = physmem * bufcachepercent / 100;
 
+	bufhighpages = bufpages;
+
+	/*
+	 * set the base backoff level for the buffer cache to bufpages.
+	 * we will not allow uvm to steal back more than this number of
+	 * pages
+	 */
+	buflowpages = physmem * 10 / 100;
+
+
 	if (bufkvm == 0)
 		bufkvm = (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 10;
 
@@ -245,7 +263,10 @@ bufadjust(int newbufpages)
 	 * XXX - note, bufkvm was allocated once, based on 10% of physmem
 	 * see above.
 	 */
+	struct buf *bp;
+	int s;
 
+	s = splbio();
 	bufpages = newbufpages;
 
 	hidirtypages = (bufpages / 4) * 3;
@@ -259,8 +280,76 @@ bufadjust(int newbufpages)
 	locleanpages = bufpages - (bufpages / 10);
 
 	maxcleanpages = locleanpages;
+
+	/*
+	 * If we we have more buffers allocated than bufpages,
+	 * free them up to get back down. this may possibly consume
+	 * all our clean pages...
+	 */
+	while ((bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN])) &&
+	    (bcstats.numbufpages > bufpages)) {
+		bremfree(bp);
+		if (bp->b_vp) {
+			RB_REMOVE(buf_rb_bufs,
+			    &bp->b_vp->v_bufs_tree, bp);
+			brelvp(bp);
+		}
+		buf_put(bp);
+	}
+
+	/*
+	 * Wake up cleaner if we're getting low on pages. We might
+	 * now have too much dirty, or have fallen below our low
+	 * water mark on clean pages so we need to free more stuff
+	 * up.
+	 */
+	if (bcstats.numdirtypages >= hidirtypages ||
+	    bcstats.numcleanpages <= locleanpages)
+		wakeup(&bd_req);
+
+	/*
+	 * if immediate action has not freed up enough goo for us
+	 * to proceed - we tsleep and wait for the cleaner above
+	 * to do it's work and get us reduced down to sanity.
+	 */
+	while (bcstats.numbufpages > bufpages) {
+		tsleep(&needbuffer, PRIBIO, "needbuffer", 0);
+	}
+	splx(s);
 }
 
+/*
+ * Make the buffer cache back off from cachepct.
+ */
+int
+bufbackoff()
+{
+	/*
+	 * Back off the amount of buffer cache pages. Called by the page
+	 * daemon to consume buffer cache pages rather than swapping.
+	 *
+	 * On success, it frees N pages from the buffer cache, and sets
+	 * a flag so that the next N allocations from buf_get will recycle
+	 * a buffer rather than allocate a new one. It then returns 0 to the
+	 * caller. 
+	 *
+	 * on failure, it could free no pages from the buffer cache, does
+	 * nothing and returns -1 to the caller. 
+	 */
+	long d;
+
+	if (bufpages <= buflowpages) 
+		return(-1);
+
+	if (bufpages - BACKPAGES >= buflowpages)
+		d = BACKPAGES;
+	else
+		d = bufpages - buflowpages;
+	backoffpages = BACKPAGES;
+	bufadjust(bufpages - d);
+	backoffpages = BACKPAGES;
+	return(0);
+}
 
 struct buf *
 bio_doread(struct vnode *vp, daddr64_t blkno, int size, int async)
@@ -862,11 +951,24 @@ geteblk(int size)
 struct buf *
 buf_get(struct vnode *vp, daddr64_t blkno, size_t size)
 {
+	static int gcount = 0;
 	struct buf *bp;
 	int poolwait = size == 0 ? PR_NOWAIT : PR_WAITOK;
 	int npages;
 	int s;
 
+	/*
+	 * if we were previously backed off, slowly climb back up
+	 * to the high water mark again.
+	 */
+	if ((backoffpages == 0) && (bufpages < bufhighpages)) {
+		if ( gcount == 0 )  {
+			bufadjust(bufpages + BACKPAGES);
+			gcount += BACKPAGES;
+		} else
+			gcount--;
+	}
+
 	s = splbio();
 	if (size) {
 		/*
@@ -898,7 +1000,8 @@ buf_get(struct vnode *vp, daddr64_t blkno, size_t size)
 		/*
 		 * Free some buffers until we have enough space.
 		 */
-		while (bcstats.numbufpages + npages > bufpages) {
+		while ((bcstats.numbufpages + npages > bufpages)
+		    || backoffpages) {
 			int freemax = 5;
 			int i = freemax;
 			while ((bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN])) && i--) {
@@ -910,7 +1013,8 @@ buf_get(struct vnode *vp, daddr64_t blkno, size_t size)
 				}
 				buf_put(bp);
 			}
-			if (freemax == i) {
+			if (freemax == i &&
+			    (bcstats.numbufpages + npages > bufpages)) {
 				needbuffer++;
 				tsleep(&needbuffer, PRIBIO, "needbuffer", 0);
 				splx(s);
-- 
cgit v1.2.3