path: root/sys/kern/vfs_bio.c
author     Ted Unangst <tedu@cvs.openbsd.org>  2014-08-31 21:08:49 +0000
committer  Ted Unangst <tedu@cvs.openbsd.org>  2014-08-31 21:08:49 +0000
commit     579dcf1c8b87926a3ab6438ba522dd9984e3a13a (patch)
tree       f76a1a977af9ce3c2ac73b8251221cf3515be9bf /sys/kern/vfs_bio.c
parent     5cc132474e4467ab67f03b4b15f6741cc3554ca1 (diff)
replace LRU bufcache with something originally modelled after 2Q.
this should provide a degree of scan resistance, and also serves as a midway point for further development of multi queue algorithms. i've tried to minimize the risk and degree of regressions. probably ok beck
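
Before reading the diff, the following stand-alone sketch may help. It is not
the kernel code (that follows below); it is a simplified user-space toy that
borrows only the queue names and the B_WARM/B_COLD flag handling from this
patch, with placeholder flag values and a hypothetical access pattern in
main(). It shows how a single buffer migrates from hot to cold to warm as it
is pushed out and touched again.

/* toy model of the hot/cold/warm promotion scheme; not the kernel code */
#include <stdio.h>

/* placeholder values; the real flags live in sys/buf.h */
#define B_WARM  0x01
#define B_COLD  0x02

/* classify a buf the way bufcache_take() checks flags: warm before cold */
static const char *
whichqueue(int flags)
{
        if (flags & B_WARM)
                return "warmqueue";
        if (flags & B_COLD)
                return "coldqueue";
        return "hotqueue";      /* neither flag set: still hot */
}

/* bufcache_release(): a buf that has ever gone cold is promoted to warm */
static int
release(int flags)
{
        if (flags & (B_WARM | B_COLD))
                flags |= B_WARM;
        return flags;
}

/* chillbufs(): the oldest buf of an oversized queue is demoted to cold */
static int
chill(int flags)
{
        flags &= ~B_WARM;
        flags |= B_COLD;
        return flags;
}

int
main(void)
{
        int flags = 0;  /* a brand new clean buf: hot */

        flags = release(flags);
        printf("first release    -> %s\n", whichqueue(flags)); /* hotqueue  */
        flags = chill(flags);
        printf("pushed out       -> %s\n", whichqueue(flags)); /* coldqueue */
        flags = release(flags);
        printf("accessed again   -> %s\n", whichqueue(flags)); /* warmqueue */
        flags = chill(flags);
        printf("pushed out again -> %s\n", whichqueue(flags)); /* coldqueue */
        return 0;
}
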
Diffstat (limited to 'sys/kern/vfs_bio.c')
-rw-r--r--  sys/kern/vfs_bio.c  162
1 files changed, 151 insertions, 11 deletions
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 51e19fed7b6..53bd6abf5f6 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: vfs_bio.c,v 1.160 2014/07/13 15:48:41 tedu Exp $ */
+/* $OpenBSD: vfs_bio.c,v 1.161 2014/08/31 21:08:48 tedu Exp $ */
/* $NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $ */
/*
@@ -66,6 +66,10 @@ int nobuffers;
int needbuffer;
struct bio_ops bioops;
+/* private bufcache functions */
+void bufcache_init(void);
+void bufcache_adjust(void);
+
/*
* Buffer pool for I/O buffers.
*/
@@ -248,6 +252,7 @@ bufadjust(int newbufpages)
}
buf_put(bp);
}
+ bufcache_adjust();
/*
* Wake up the cleaner if we have lots of dirty pages,
@@ -1171,26 +1176,113 @@ bcstats_print(
#endif
/* bufcache freelist code below */
+/*
+ * Copyright (c) 2014 Ted Unangst <tedu@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * The code below implements a variant of the 2Q buffer cache algorithm by
+ * Johnson and Shasha.
+ *
+ * General Outline
+ * We divide the buffer cache into three working sets: current, previous,
+ * and long term. Each list is itself LRU and buffers get promoted and moved
+ * around between them. A buffer starts its life in the current working set.
+ * As time passes and newer buffers push it out, it moves into the previous
+ * working set and becomes subject to recycling. But if it's accessed again from
+ * the previous working set, that's an indication that it's actually in the
+ * long term working set, so we promote it there. The separation of current
+ * and previous working sets prevents us from promoting a buffer that's only
+ * temporarily hot to the long term cache.
+ *
+ * The objective is to provide scan resistance by making the long term
+ * working set ineligible for immediate recycling, even as the current
+ * working set is rapidly turned over.
+ *
+ * Implementation
+ * The code below identifies the current, previous, and long term sets as
+ * hotqueue, coldqueue, and warmqueue. The hot and warm queues are each
+ * capped at 1/3 of the total clean pages; once a queue exceeds its cap, it
+ * starts pushing its oldest buffers onto the coldqueue.
+ * A buf always starts out with neither the WARM nor COLD flag set (implying HOT).
+ * When released, it will be returned to the tail of the hotqueue list.
+ * When the hotqueue gets too large, the oldest hot buf will be moved to the
+ * coldqueue, with the B_COLD flag set. When a cold buf is released, we set
+ * the B_WARM flag and put it onto the warmqueue. Warm bufs are also
+ * directly returned to the end of the warmqueue. As with the hotqueue, when
+ * the warmqueue grows too large, bufs are moved onto the coldqueue.
+ *
+ * Note that this design still supports working sets larger than the caps
+ * on hotqueue and warmqueue would imply, because the coldqueue is still
+ * cached and has no maximum length. The hot and warm queues form a Y
+ * feeding into the coldqueue, and moving bufs between queues is constant
+ * time, so in the worst case the design decays to one long warm->cold queue.
+ *
+ * In the 2Q paper, hotqueue and coldqueue are A1in and A1out. The warmqueue
+ * is Am. Unlike 2Q, where A1out holds only page identifiers rather than the
+ * pages themselves, we always cache the full pages.
+ *
+ */
/*
- * simple LRU queues, one clean and one dirty
+ *
*/
TAILQ_HEAD(bufqueue, buf);
-struct bufqueue cleanqueue;
+struct bufqueue hotqueue;
+int64_t hotbufpages;
+struct bufqueue coldqueue;
+struct bufqueue warmqueue;
+int64_t warmbufpages;
struct bufqueue dirtyqueue;
+/*
+ * this function is called when a hot or warm queue may have exceeded its
+ * size limit. if it has, it moves one buf to the coldqueue.
+ */
+int chillbufs(struct bufqueue *queue, int64_t *queuepages);
+
void
bufcache_init(void)
{
- TAILQ_INIT(&cleanqueue);
+ TAILQ_INIT(&hotqueue);
+ TAILQ_INIT(&coldqueue);
+ TAILQ_INIT(&warmqueue);
TAILQ_INIT(&dirtyqueue);
}
+/*
+ * if the buffer cache has shrunk, we may need to rebalance our queues.
+ */
+void
+bufcache_adjust(void)
+{
+ while (chillbufs(&warmqueue, &warmbufpages) ||
+ chillbufs(&hotqueue, &hotbufpages))
+ ;
+}
+
struct buf *
bufcache_getcleanbuf(void)
{
- return TAILQ_FIRST(&cleanqueue);
+ struct buf *bp;
+
+ if ((bp = TAILQ_FIRST(&coldqueue)))
+ return bp;
+ if ((bp = TAILQ_FIRST(&warmqueue)))
+ return bp;
+ return TAILQ_FIRST(&hotqueue);
}
struct buf *
@@ -1203,31 +1295,79 @@ void
bufcache_take(struct buf *bp)
{
struct bufqueue *queue;
+ int64_t pages;
splassert(IPL_BIO);
+ pages = atop(bp->b_bufsize);
if (!ISSET(bp->b_flags, B_DELWRI)) {
- queue = &cleanqueue;
- bcstats.numcleanpages -= atop(bp->b_bufsize);
+ if (ISSET(bp->b_flags, B_WARM)) {
+ queue = &warmqueue;
+ warmbufpages -= pages;
+ } else if (ISSET(bp->b_flags, B_COLD)) {
+ queue = &coldqueue;
+ } else {
+ queue = &hotqueue;
+ hotbufpages -= pages;
+ }
+ bcstats.numcleanpages -= pages;
} else {
queue = &dirtyqueue;
- bcstats.numdirtypages -= atop(bp->b_bufsize);
+ bcstats.numdirtypages -= pages;
bcstats.delwribufs--;
}
TAILQ_REMOVE(queue, bp, b_freelist);
}
+int
+chillbufs(struct bufqueue *queue, int64_t *queuepages)
+{
+ struct buf *bp;
+ int64_t limit, pages;
+
+ /*
+ * The warm and hot queues are each allowed to hold up to one third of the
+ * clean pages. A queue is never chilled below 96 pages, to prevent too
+ * much "wobbling" when the cache is small.
+ */
+ limit = bcstats.numcleanpages / 3;
+ if (*queuepages > 96 && *queuepages > limit) {
+ bp = TAILQ_FIRST(queue);
+ if (!bp)
+ panic("inconsistent bufpage counts");
+ pages = atop(bp->b_bufsize);
+ *queuepages -= pages;
+ TAILQ_REMOVE(queue, bp, b_freelist);
+ CLR(bp->b_flags, B_WARM);
+ SET(bp->b_flags, B_COLD);
+ TAILQ_INSERT_TAIL(&coldqueue, bp, b_freelist);
+ return 1;
+ }
+ return 0;
+}
+
void
bufcache_release(struct buf *bp)
{
struct bufqueue *queue;
+ int64_t pages;
+ pages = atop(bp->b_bufsize);
if (!ISSET(bp->b_flags, B_DELWRI)) {
- queue = &cleanqueue;
- bcstats.numcleanpages += atop(bp->b_bufsize);
+ int64_t *queuepages;
+ if (ISSET(bp->b_flags, B_WARM | B_COLD)) {
+ SET(bp->b_flags, B_WARM);
+ queue = &warmqueue;
+ queuepages = &warmbufpages;
+ } else {
+ queue = &hotqueue;
+ queuepages = &hotbufpages;
+ }
+ *queuepages += pages;
+ bcstats.numcleanpages += pages;
+ chillbufs(queue, queuepages);
} else {
queue = &dirtyqueue;
- bcstats.numdirtypages += atop(bp->b_bufsize);
+ bcstats.numdirtypages += pages;
bcstats.delwribufs++;
}
TAILQ_INSERT_TAIL(queue, bp, b_freelist);
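
To make the chillbufs() cap concrete, here is a short worked example with
made-up numbers (nothing below is measured data): suppose
bcstats.numcleanpages is 3000, so each of hotqueue and warmqueue may hold at
most 3000 / 3 = 1000 pages, and the 96-page floor is irrelevant at this size.

#include <stdio.h>

int
main(void)
{
        long long numcleanpages = 3000;         /* hypothetical bcstats.numcleanpages */
        long long hotbufpages = 1010;           /* hypothetical hotqueue size in pages */
        long long limit = numcleanpages / 3;    /* 1000 */

        if (hotbufpages > 96 && hotbufpages > limit)
                printf("over by %lld pages: demote oldest hot buf to coldqueue\n",
                    hotbufpages - limit);
        return 0;
}

Note that chillbufs() moves at most one buf per call: bufcache_release() calls
it once per released buf, while bufcache_adjust() loops until both the hot and
warm queues are back under their limits.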