| author | Ted Unangst <tedu@cvs.openbsd.org> | 2014-08-31 21:08:49 +0000 |
|---|---|---|
| committer | Ted Unangst <tedu@cvs.openbsd.org> | 2014-08-31 21:08:49 +0000 |
| commit | 579dcf1c8b87926a3ab6438ba522dd9984e3a13a (patch) | |
| tree | f76a1a977af9ce3c2ac73b8251221cf3515be9bf /sys/kern/vfs_bio.c | |
| parent | 5cc132474e4467ab67f03b4b15f6741cc3554ca1 (diff) | |
replace LRU bufcache with something originally modelled after 2Q.
this should provide a degree of scan resistance, and also serves as a
midway point for further development of multi queue algorithms.
i've tried to minimize the risk and degree of regressions.
probably ok beck
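
As a reading aid for the patch below: it replaces the single clean LRU list with three TAILQs (hotqueue, coldqueue, warmqueue) plus per-queue page counts. The sketch that follows is a rough userland model of that policy only, not the kernel code; the structure names, the fixed four-buffer cap, and the demo in main() are invented for illustration, and the dirty queue, the page accounting, and the 96-page minimum are left out.

```c
/*
 * Illustrative userland model of the hot/cold/warm queues; all names and
 * sizes below are invented for the example.
 */
#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_COLD	0x01
#define DEMO_WARM	0x02

struct demo_buf {
	TAILQ_ENTRY(demo_buf) entry;
	int id;
	int flags;
};
TAILQ_HEAD(demo_queue, demo_buf);

static struct demo_queue hotq = TAILQ_HEAD_INITIALIZER(hotq);
static struct demo_queue coldq = TAILQ_HEAD_INITIALIZER(coldq);
static struct demo_queue warmq = TAILQ_HEAD_INITIALIZER(warmq);
static int hotcount, warmcount;
static const int cap = 4;	/* stand-in for the 1/3-of-clean-pages limit */

/* demote the oldest bufs of an over-full hot or warm queue to the cold queue */
static void
chill(struct demo_queue *q, int *count)
{
	struct demo_buf *b;

	while (*count > cap && (b = TAILQ_FIRST(q)) != NULL) {
		TAILQ_REMOVE(q, b, entry);
		(*count)--;
		b->flags = DEMO_COLD;
		TAILQ_INSERT_TAIL(&coldq, b, entry);
	}
}

/* release a buf: hot bufs stay hot, cold or warm bufs go to the warm queue */
static void
release(struct demo_buf *b)
{
	if (b->flags & (DEMO_COLD | DEMO_WARM)) {
		b->flags = DEMO_WARM;
		TAILQ_INSERT_TAIL(&warmq, b, entry);
		warmcount++;
		chill(&warmq, &warmcount);
	} else {
		TAILQ_INSERT_TAIL(&hotq, b, entry);
		hotcount++;
		chill(&hotq, &hotcount);
	}
}

int
main(void)
{
	struct demo_buf *b;
	int i;

	/* a scan of ten single-use bufs churns only the hot and cold queues */
	for (i = 0; i < 10; i++) {
		b = calloc(1, sizeof(*b));
		b->id = i;
		release(b);
	}

	/* touching a cold buf again promotes it to the warm queue */
	b = TAILQ_FIRST(&coldq);
	TAILQ_REMOVE(&coldq, b, entry);
	release(b);

	TAILQ_FOREACH(b, &warmq, entry)
		printf("warm buf %d\n", b->id);	/* prints the promoted buf */
	return 0;
}
```

In the model, a one-pass scan of single-use buffers only cycles through the hot and cold queues, while a buffer that is touched again after going cold is promoted to the warm queue; that is the scan-resistance property the commit message refers to.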
Diffstat (limited to 'sys/kern/vfs_bio.c')
| -rw-r--r-- | sys/kern/vfs_bio.c | 162 |
|---|---|---|

1 file changed, 151 insertions, 11 deletions
```diff
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 51e19fed7b6..53bd6abf5f6 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: vfs_bio.c,v 1.160 2014/07/13 15:48:41 tedu Exp $ */
+/* $OpenBSD: vfs_bio.c,v 1.161 2014/08/31 21:08:48 tedu Exp $ */
 /* $NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $ */
 
 /*
@@ -66,6 +66,10 @@ int nobuffers;
 int needbuffer;
 struct bio_ops bioops;
 
+/* private bufcache functions */
+void bufcache_init(void);
+void bufcache_adjust(void);
+
 /*
  * Buffer pool for I/O buffers.
  */
@@ -248,6 +252,7 @@ bufadjust(int newbufpages)
 		}
 		buf_put(bp);
 	}
+	bufcache_adjust();
 
 	/*
 	 * Wake up the cleaner if we have lots of dirty pages,
@@ -1171,26 +1176,113 @@ bcstats_print(
 #endif
 
 /* bufcache freelist code below */
+/*
+ * Copyright (c) 2014 Ted Unangst <tedu@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * The code below implements a variant of the 2Q buffer cache algorithm by
+ * Johnson and Shasha.
+ *
+ * General Outline
+ * We divide the buffer cache into three working sets: current, previous,
+ * and long term. Each list is itself LRU and buffers get promoted and moved
+ * around between them. A buffer starts its life in the current working set.
+ * As time passes and newer buffers push it out, it will turn into the previous
+ * working set and is subject to recycling. But if it's accessed again from
+ * the previous working set, that's an indication that it's actually in the
+ * long term working set, so we promote it there. The separation of current
+ * and previous working sets prevents us from promoting a buffer that's only
+ * temporarily hot to the long term cache.
+ *
+ * The objective is to provide scan resistance by making the long term
+ * working set ineligible for immediate recycling, even as the current
+ * working set is rapidly turned over.
+ *
+ * Implementation
+ * The code below identifies the current, previous, and long term sets as
+ * hotqueue, coldqueue, and warmqueue. The hot and warm queues are capped at
+ * 1/3 of the total clean pages, after which point they start pushing their
+ * oldest buffers into coldqueue.
+ * A buf always starts out with neither WARM or COLD flags set (implying HOT).
+ * When released, it will be returned to the tail of the hotqueue list.
+ * When the hotqueue gets too large, the oldest hot buf will be moved to the
+ * coldqueue, with the B_COLD flag set. When a cold buf is released, we set
+ * the B_WARM flag and put it onto the warmqueue. Warm bufs are also
+ * directly returned to the end of the warmqueue. As with the hotqueue, when
+ * the warmqueue grows too large, bufs are moved onto the coldqueue.
+ *
+ * Note that this design does still support large working sets, greater
+ * than the cap of hotqueue or warmqueue would imply. The coldqueue is still
+ * cached and has no maximum length. The hot and warm queues form a Y feeding
+ * into the coldqueue. Moving bufs between queues is constant time, so this
+ * design decays to one long warm->cold queue.
+ *
+ * In the 2Q paper, hotqueue and coldqueue are A1in and A1out. The warmqueue
+ * is Am. We always cache pages, as opposed to pointers to pages for A1.
+ *
+ */
 /*
- * simple LRU queues, one clean and one dirty
+ *
  */
 TAILQ_HEAD(bufqueue, buf);
-struct bufqueue cleanqueue;
+struct bufqueue hotqueue;
+int64_t hotbufpages;
+struct bufqueue coldqueue;
+struct bufqueue warmqueue;
+int64_t warmbufpages;
 struct bufqueue dirtyqueue;
 
+/*
+ * this function is called when a hot or warm queue may have exceeded its
+ * size limit. it will move a buf to the coldqueue.
+ */
+int chillbufs(struct bufqueue *queue, int64_t *queuepages);
+
 void
 bufcache_init(void)
 {
-	TAILQ_INIT(&cleanqueue);
+	TAILQ_INIT(&hotqueue);
+	TAILQ_INIT(&coldqueue);
+	TAILQ_INIT(&warmqueue);
 	TAILQ_INIT(&dirtyqueue);
 }
 
+/*
+ * if the buffer cache shrunk, we may need to rebalance our queues.
+ */
+void
+bufcache_adjust(void)
+{
+	while (chillbufs(&warmqueue, &warmbufpages) ||
+	    chillbufs(&hotqueue, &hotbufpages))
+		;
+}
+
 struct buf *
 bufcache_getcleanbuf(void)
 {
-	return TAILQ_FIRST(&cleanqueue);
+	struct buf *bp;
+
+	if ((bp = TAILQ_FIRST(&coldqueue)))
+		return bp;
+	if ((bp = TAILQ_FIRST(&warmqueue)))
+		return bp;
+	return TAILQ_FIRST(&hotqueue);
 }
 
 struct buf *
@@ -1203,31 +1295,79 @@ void
 bufcache_take(struct buf *bp)
 {
 	struct bufqueue *queue;
+	int64_t pages;
 
 	splassert(IPL_BIO);
 
+	pages = atop(bp->b_bufsize);
 	if (!ISSET(bp->b_flags, B_DELWRI)) {
-		queue = &cleanqueue;
-		bcstats.numcleanpages -= atop(bp->b_bufsize);
+		if (ISSET(bp->b_flags, B_WARM)) {
+			queue = &warmqueue;
+			warmbufpages -= pages;
+		} else if (ISSET(bp->b_flags, B_COLD)) {
+			queue = &coldqueue;
+		} else {
+			queue = &hotqueue;
+			hotbufpages -= pages;
+		}
+		bcstats.numcleanpages -= pages;
 	} else {
 		queue = &dirtyqueue;
-		bcstats.numdirtypages -= atop(bp->b_bufsize);
+		bcstats.numdirtypages -= pages;
 		bcstats.delwribufs--;
 	}
 	TAILQ_REMOVE(queue, bp, b_freelist);
 }
 
+int
+chillbufs(struct bufqueue *queue, int64_t *queuepages)
+{
+	struct buf *bp;
+	int64_t limit, pages;
+
+	/*
+	 * The warm and hot queues are allowed to be up to one third each.
+	 * We impose a minimum size of 96 to prevent too much "wobbling".
+	 */
+	limit = bcstats.numcleanpages / 3;
+	if (*queuepages > 96 && *queuepages > limit) {
+		bp = TAILQ_FIRST(queue);
+		if (!bp)
+			panic("inconsistent bufpage counts");
+		pages = atop(bp->b_bufsize);
+		*queuepages -= pages;
+		TAILQ_REMOVE(queue, bp, b_freelist);
+		CLR(bp->b_flags, B_WARM);
+		SET(bp->b_flags, B_COLD);
+		TAILQ_INSERT_TAIL(&coldqueue, bp, b_freelist);
+		return 1;
+	}
+	return 0;
+}
+
 void
 bufcache_release(struct buf *bp)
 {
 	struct bufqueue *queue;
+	int64_t pages;
 
+	pages = atop(bp->b_bufsize);
 	if (!ISSET(bp->b_flags, B_DELWRI)) {
-		queue = &cleanqueue;
-		bcstats.numcleanpages += atop(bp->b_bufsize);
+		int64_t *queuepages;
+		if (ISSET(bp->b_flags, B_WARM | B_COLD)) {
+			SET(bp->b_flags, B_WARM);
+			queue = &warmqueue;
+			queuepages = &warmbufpages;
+		} else {
+			queue = &hotqueue;
+			queuepages = &hotbufpages;
+		}
+		*queuepages += pages;
+		bcstats.numcleanpages += pages;
+		chillbufs(queue, queuepages);
 	} else {
 		queue = &dirtyqueue;
-		bcstats.numdirtypages += atop(bp->b_bufsize);
+		bcstats.numdirtypages += pages;
 		bcstats.delwribufs++;
 	}
 	TAILQ_INSERT_TAIL(queue, bp, b_freelist);
```
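
The one tunable in the new code is the demotion point in chillbufs(): a hot or warm queue is only trimmed once it exceeds both the 96-page floor and a third of the clean pages. A standalone restatement of that check, illustrative only (bcstats and the kernel types are replaced by plain arguments, and the numbers in main() are hypothetical):

```c
#include <stdio.h>
#include <stdint.h>

/* same condition as chillbufs() above, with the kernel globals passed in */
static int
needs_chill(int64_t queuepages, int64_t numcleanpages)
{
	int64_t limit = numcleanpages / 3;

	/* the 96-page floor keeps small caches from oscillating */
	return queuepages > 96 && queuepages > limit;
}

int
main(void)
{
	printf("%d\n", needs_chill(500, 1200));	/* 1: over the 400-page cap */
	printf("%d\n", needs_chill(300, 1200));	/* 0: under the cap */
	printf("%d\n", needs_chill(80, 90));	/* 0: below the 96-page floor */
	return 0;
}
```

When the cache itself shrinks (via bufadjust() above), bufcache_adjust() simply re-applies this check in a loop until both the hot and warm queues fit within the new limit.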