/*	$OpenBSD: vm_pageout.c,v 1.11 2001/03/21 23:24:51 art Exp $	*/
/*	$NetBSD: vm_pageout.c,v 1.23 1996/02/05 01:54:07 christos Exp $	*/

/* 
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vm_pageout.c	8.7 (Berkeley) 6/19/95
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 * 
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 * 
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 * 
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 *	The proverbial page-out daemon.
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/pool.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>

#ifndef VM_PAGE_FREE_MIN
#define VM_PAGE_FREE_MIN	(cnt.v_free_count / 20)
#endif

#ifndef VM_PAGE_FREE_TARGET
#define VM_PAGE_FREE_TARGET	((cnt.v_free_min * 4) / 3)
#endif

int	vm_page_free_min_min = 16 * 1024;
int	vm_page_free_min_max = 256 * 1024;

int	vm_pages_needed;	/* Event on which pageout daemon sleeps */

int	vm_page_max_wired = 0;	/* XXX max # of wired pages system-wide */

#ifdef CLUSTERED_PAGEOUT
#define MAXPOCLUSTER		(MAXPHYS/NBPG)	/* XXX */
int doclustered_pageout = 1;
#endif

/*
 * Activate the pageout daemon and sleep awaiting more free memory
 */
void
vm_wait(msg)
	char *msg;
{
	int timo = 0;

	if(curproc == pageout_daemon) {
		/*
		 * We might be toast here, but IF some paging operations
		 * are pending then pages will magically appear. We
		 * usually can't return an error because callers of
		 * malloc who can wait generally don't check for
		 * failure.
		 *
		 * Only the pageout_daemon wakes up this channel!
		 */
		printf("pageout daemon has stalled\n");
		timo = hz >> 3;
	}
	simple_lock(&vm_pages_needed_lock);
	thread_wakeup(&vm_pages_needed);
	thread_sleep_msg(&cnt.v_free_count, &vm_pages_needed_lock, FALSE, msg,
		timo);
}

/*
 *	vm_pageout_scan does the dirty work for the pageout daemon.
 */
void
vm_pageout_scan()
{
	register vm_page_t	m, next;
	register int		page_shortage;
	register int		s;
	register int		pages_freed;
	int			free;
	vm_object_t		object;

	/*
	 *	Only continue when we want more pages to be "free"
	 */

	cnt.v_rev++;

	s = splimp();
	simple_lock(&vm_page_queue_free_lock);
	free = cnt.v_free_count;
	simple_unlock(&vm_page_queue_free_lock);
	splx(s);

#ifndef __SWAP_BROKEN					/* XXX */
	if (free < cnt.v_free_target) {
		swapout_threads();

		/*
		 *	Be sure the pmap system is updated so
		 *	we can scan the inactive queue.
		 */

		pmap_update();
	}
#endif							/* XXX */

	/*
	 *	Acquire the resident page system lock,
	 *	as we may be changing what's resident quite a bit.
	 */
	vm_page_lock_queues();

	/*
	 *	Start scanning the inactive queue for pages we can free.
	 *	We keep scanning until we have enough free pages or
	 *	we have scanned through the entire queue.  If we
	 *	encounter dirty pages, we start cleaning them.
	 */

	pages_freed = 0;
	for (m = vm_page_queue_inactive.tqh_first; m != NULL; m = next) {
		s = splimp();
		simple_lock(&vm_page_queue_free_lock);
		free = cnt.v_free_count;
		simple_unlock(&vm_page_queue_free_lock);
		splx(s);
		if (free >= cnt.v_free_target)
			break;

		cnt.v_scan++;
		next = m->pageq.tqe_next;

		/*
		 * If the page has been referenced, move it back to the
		 * active queue.
		 */
		if (pmap_is_referenced(VM_PAGE_TO_PHYS(m))) {
			vm_page_activate(m);
			cnt.v_reactivated++;
			continue;
		}

		/*
		 * If the page is clean, free it up.
		 */
		if (m->flags & PG_CLEAN) {
			object = m->object;
			if (vm_object_lock_try(object)) {
				pmap_page_protect(VM_PAGE_TO_PHYS(m),
						  VM_PROT_NONE);
				vm_page_free(m);
				pages_freed++;
				cnt.v_dfree++;
				vm_object_unlock(object);
			}
			continue;
		}

		/*
		 * If the page is dirty but already being washed, skip it.
		 */
		if ((m->flags & PG_LAUNDRY) == 0)
			continue;

		/*
		 * Otherwise the page is dirty and still in the laundry,
		 * so we start the cleaning operation and remove it from
		 * the laundry.
		 */
		object = m->object;
		if (!vm_object_lock_try(object))
			continue;
#ifdef CLUSTERED_PAGEOUT
		if (object->pager &&
		    vm_pager_cancluster(object->pager, PG_CLUSTERPUT))
			vm_pageout_cluster(m, object);
		else
#endif
		vm_pageout_page(m, object);
		thread_wakeup(object);
		vm_object_unlock(object);
		/*
		 * Former next page may no longer even be on the inactive
		 * queue (due to potential blocking in the pager with the
		 * queues unlocked).  If it isn't, we just start over.
		 */
		if (next && (next->flags & PG_INACTIVE) == 0)
			next = vm_page_queue_inactive.tqh_first;
	}
	
	/*
	 *	Compute the page shortage.  If we are still very low on memory
	 *	be sure that we will move a minimal amount of pages from active
	 *	to inactive.
	 */

	page_shortage = cnt.v_inactive_target - cnt.v_inactive_count;
	if (page_shortage <= 0 && pages_freed == 0)
		page_shortage = 1;

	while (page_shortage > 0) {
		/*
		 *	Move some more pages from active to inactive.
		 */

		if ((m = vm_page_queue_active.tqh_first) == NULL)
			break;
		vm_page_deactivate(m);
		page_shortage--;
	}

	vm_page_unlock_queues();
}

/*
 * Called with object and page queues locked.
 * If reactivate is TRUE, a pager error causes the page to be
 * put back on the active queue, ow it is left on the inactive queue.
 */
void
vm_pageout_page(m, object)
	vm_page_t m;
	vm_object_t object;
{
	vm_pager_t pager;
	int pageout_status;

	/*
	 * We set the busy bit to cause potential page faults on
	 * this page to block.
	 *
	 * We also set pageout-in-progress to keep the object from
	 * disappearing during pageout.  This guarantees that the
	 * page won't move from the inactive queue.  (However, any
	 * other page on the inactive queue may move!)
	 */
	pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE);
	m->flags |= PG_BUSY;

	/*
	 * Try to collapse the object before making a pager for it.
	 * We must unlock the page queues first.
	 */
	vm_page_unlock_queues();

#if 0
	/*
	 * vm_object_collapse might want to sleep waiting for pages which
	 * is not allowed to do in this thread.  Anyway, we now aggressively
	 * collapse object-chains as early as possible so this call ought
	 * to not be very useful anyhow.  This is just an educated guess.
	 * Not doing a collapse operation is never fatal though, so we skip
	 * it for the time being.  Later we might add some NOWAIT option for
	 * the collapse code to look at, if it's deemed necessary.
	 */
	if (object->pager == NULL)
		vm_object_collapse(object);
#endif

	vm_object_paging_begin(object);
	vm_object_unlock(object);

	/*
	 * We _used_ to wakeup page consumers here, "in case the following
	 * operations block". That leads to livelock if the pageout fails,
	 * which is actually quite a common thing for NFS paging.
	 */

	/*
	 * If there is no pager for the page, use the default pager.
	 * If there is no place to put the page at the moment,
	 * leave it in the laundry and hope that there will be
	 * paging space later.
	 */
	if ((pager = object->pager) == NULL) {
		pager = vm_pager_allocate(PG_DFLT, (caddr_t)0, object->size,
		    VM_PROT_ALL, (vm_offset_t)0);
		if (pager != NULL)
			vm_object_setpager(object, pager, 0, FALSE);
	}
	pageout_status = pager ? vm_pager_put(pager, m, FALSE) : VM_PAGER_FAIL;
	vm_object_lock(object);
	vm_page_lock_queues();

	switch (pageout_status) {
	case VM_PAGER_OK:
	case VM_PAGER_PEND:
		/* hmm, don't wakeup if memory is _very_ low? */
		thread_wakeup(&cnt.v_free_count);
		cnt.v_pageouts++;
		cnt.v_pgpgout++;
		m->flags &= ~PG_LAUNDRY;
		break;
	case VM_PAGER_BAD:
		/*
		 * Page outside of range of object.  Right now we
		 * essentially lose the changes by pretending it
		 * worked.
		 *
		 * XXX dubious, what should we do?
		 */
		m->flags &= ~PG_LAUNDRY;
		m->flags |= PG_CLEAN;
		pmap_clear_modify(VM_PAGE_TO_PHYS(m));
		break;
	case VM_PAGER_AGAIN:
	{
		/*
		 * FAIL on a write is interpreted to mean a resource
		 * shortage, so we put pause for awhile and try again.
		 * XXX could get stuck here.
		 */
		(void)tsleep((caddr_t)&vm_pages_needed, PZERO|PCATCH,
		    "pageout", hz>>3);
		break;
	}
	case VM_PAGER_FAIL:
	case VM_PAGER_ERROR:
		/*
		 * If page couldn't be paged out, then reactivate
		 * the page so it doesn't clog the inactive list.
		 * (We will try paging out it again later).
		 */
		vm_page_activate(m);
		cnt.v_reactivated++;
		break;
	}

	pmap_clear_reference(VM_PAGE_TO_PHYS(m));

	/*
	 * If the operation is still going, leave the page busy
	 * to block all other accesses.  Also, leave the paging
	 * in progress indicator set so that we don't attempt an
	 * object collapse.
	 */
	if (pageout_status != VM_PAGER_PEND) {
		m->flags &= ~PG_BUSY;
		PAGE_WAKEUP(m);
		vm_object_paging_end(object);
	}
}

#ifdef CLUSTERED_PAGEOUT
#define PAGEOUTABLE(p) \
	((((p)->flags & (PG_INACTIVE|PG_CLEAN|PG_LAUNDRY)) == \
	  (PG_INACTIVE|PG_LAUNDRY)) && !pmap_is_referenced(VM_PAGE_TO_PHYS(p)))

/*
 * Attempt to pageout as many contiguous (to ``m'') dirty pages as possible
 * from ``object''.  Using information returned from the pager, we assemble
 * a sorted list of contiguous dirty pages and feed them to the pager in one
 * chunk.  Called with paging queues and object locked.  Also, object must
 * already have a pager.
 */
void
vm_pageout_cluster(m, object)
	vm_page_t m;
	vm_object_t object;
{
	vm_offset_t offset, loff, hoff;
	vm_page_t plist[MAXPOCLUSTER], *plistp, p;
	int postatus, ix, count;

	cnt.v_pageouts++;
	/*
	 * Determine the range of pages that can be part of a cluster
	 * for this object/offset.  If it is only our single page, just
	 * do it normally.
	 */
	vm_pager_cluster(object->pager, m->offset, &loff, &hoff);
	if (hoff - loff == PAGE_SIZE) {
		vm_pageout_page(m, object);
		return;
	}

	plistp = plist;

	/*
	 * Target page is always part of the cluster.
	 */
	pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE);
	m->flags |= PG_BUSY;
	plistp[atop(m->offset - loff)] = m;
	count = 1;

	/*
	 * Backup from the given page til we find one not fulfilling
	 * the pageout criteria or we hit the lower bound for the
	 * cluster.  For each page determined to be part of the
	 * cluster, unmap it and busy it out so it won't change.
	 */
	ix = atop(m->offset - loff);
	offset = m->offset;
	while (offset > loff && count < MAXPOCLUSTER-1) {
		p = vm_page_lookup(object, offset - PAGE_SIZE);
		if (p == NULL || !PAGEOUTABLE(p))
			break;
		pmap_page_protect(VM_PAGE_TO_PHYS(p), VM_PROT_NONE);
		p->flags |= PG_BUSY;
		plistp[--ix] = p;
		offset -= PAGE_SIZE;
		count++;
	}
	plistp += atop(offset - loff);
	loff = offset;

	/*
	 * Now do the same moving forward from the target.
	 */
	ix = atop(m->offset - loff) + 1;
	offset = m->offset + PAGE_SIZE;
	while (offset < hoff && count < MAXPOCLUSTER) {
		p = vm_page_lookup(object, offset);
		if (p == NULL || !PAGEOUTABLE(p))
			break;
		pmap_page_protect(VM_PAGE_TO_PHYS(p), VM_PROT_NONE);
		p->flags |= PG_BUSY;
		plistp[ix++] = p;
		offset += PAGE_SIZE;
		count++;
	}
	hoff = offset;

	/*
	 * Pageout the page.
	 * Unlock everything and do a wakeup prior to the pager call
	 * in case it blocks.
	 */
	vm_page_unlock_queues();
	vm_object_paging_begin(object);
	vm_object_unlock(object);
again:
	thread_wakeup(&cnt.v_free_count);
	postatus = vm_pager_put_pages(object->pager, plistp, count, FALSE);
	/*
	 * XXX rethink this
	 */
	if (postatus == VM_PAGER_AGAIN) {
		(void)tsleep((caddr_t)&vm_pages_needed, PZERO|PCATCH,
		    "pageout", 0);
		goto again;
	} else if (postatus == VM_PAGER_BAD)
		panic("vm_pageout_cluster: VM_PAGER_BAD");
	vm_object_lock(object);
	vm_page_lock_queues();

	/*
	 * Loop through the affected pages, reflecting the outcome of
	 * the operation.
	 */
	for (ix = 0; ix < count; ix++) {
		p = *plistp++;
		switch (postatus) {
		case VM_PAGER_OK:
		case VM_PAGER_PEND:
			cnt.v_pgpgout++;
			p->flags &= ~PG_LAUNDRY;
			break;
		case VM_PAGER_FAIL:
		case VM_PAGER_ERROR:
			/*
			 * Pageout failed, reactivate the target page so it
			 * doesn't clog the inactive list.  Other pages are
			 * left as they are.
			 */
			if (p == m) {
				vm_page_activate(p);
				cnt.v_reactivated++;
			}
			break;
		}
		pmap_clear_reference(VM_PAGE_TO_PHYS(p));
		/*
		 * If the operation is still going, leave the page busy
		 * to block all other accesses.
		 */
		if (postatus != VM_PAGER_PEND) {
			p->flags &= ~PG_BUSY;
			PAGE_WAKEUP(p);
		}
	}
	/*
	 * If the operation is still going, leave the paging in progress
	 * indicator set so that we don't attempt an object collapse.
	 */
	if (postatus != VM_PAGER_PEND)
		vm_object_paging_end(object);
}
#endif

/*
 *	vm_pageout is the high level pageout daemon.
 */

void
vm_pageout()
{
	pageout_daemon = curproc;
	(void) spl0();

	/*
	 *	Initialize some paging parameters.
	 */

	if (cnt.v_free_min == 0) {
		cnt.v_free_min = VM_PAGE_FREE_MIN;
		vm_page_free_min_min /= cnt.v_page_size;
		vm_page_free_min_max /= cnt.v_page_size;
		if (cnt.v_free_min < vm_page_free_min_min)
			cnt.v_free_min = vm_page_free_min_min;
		if (cnt.v_free_min > vm_page_free_min_max)
			cnt.v_free_min = vm_page_free_min_max;
	}

	if (cnt.v_free_target == 0)
		cnt.v_free_target = VM_PAGE_FREE_TARGET;

	if (cnt.v_free_target <= cnt.v_free_min)
		cnt.v_free_target = cnt.v_free_min + 1;

	/* XXX does not really belong here */
	if (vm_page_max_wired == 0)
		vm_page_max_wired = cnt.v_free_count / 3;

	/*
	 *	The pageout daemon is never done, so loop
	 *	forever.
	 */

	simple_lock(&vm_pages_needed_lock);
	while (TRUE) {
		thread_sleep_msg(&vm_pages_needed, &vm_pages_needed_lock,
			FALSE, "paged", 0);
		/*
		 * Compute the inactive target for this scan.
		 * We need to keep a reasonable amount of memory in the
		 * inactive list to better simulate LRU behavior.
		 */
		cnt.v_inactive_target =
			(cnt.v_active_count + cnt.v_inactive_count) / 3;
		if (cnt.v_inactive_target <= cnt.v_free_target)
			cnt.v_inactive_target = cnt.v_free_target + 1;

		/*
		 * Only make a scan if we are likely to do something.
		 * Otherwise we might have been awakened by a pager
		 * to clean up async pageouts.
		 */
		if (cnt.v_free_count < cnt.v_free_target ||
		    cnt.v_inactive_count < cnt.v_inactive_target) {
			pool_drain(0);
			vm_pageout_scan();
		}
		vm_pager_sync();
		simple_lock(&vm_pages_needed_lock);
		thread_wakeup(&cnt.v_free_count);
	}
}