author     Owain Ainsworth <oga@cvs.openbsd.org>    2009-06-16 17:14:16 +0000
committer  Owain Ainsworth <oga@cvs.openbsd.org>    2009-06-16 17:14:16 +0000
commit     42caf8629b478d48a8120571cf83377282e2c536 (patch)
tree       434bf60ed97f9ede587aa6d59131d935df1be782
parent     df471f546425532f5eb23a525aa177e7cebfc201 (diff)
Backout all the PG_RELEASED changes.
This is for the same reason as the earlier backouts, to avoid the bug
either added or exposed sometime around c2k9. This *should* be the last
one.
prompted by deraadt@
ok ariane@
-rw-r--r--  sys/uvm/uvm_aobj.c   | 192
-rw-r--r--  sys/uvm/uvm_aobj.h   |   4
-rw-r--r--  sys/uvm/uvm_km.c     |  25
-rw-r--r--  sys/uvm/uvm_pager.c  |  76
-rw-r--r--  sys/uvm/uvm_vnode.c  | 334
5 files changed, 474 insertions, 157 deletions
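
The hunks below re-introduce the PG_RELEASED convention that the earlier diffs had removed: instead of sleeping on a PG_BUSY page, the detach and flush paths tag it PG_RELEASED and let whoever owns the page free it when its I/O completes, marking the object for death (UAO_FLAG_KILLME) so the last releasepg call finishes the teardown. The following is a rough standalone sketch of that hand-off, not the real UVM code; the structs, flag bits, and helper names are made up for illustration.

```c
/*
 * Sketch of the restored PG_RELEASED / kill-me hand-off.
 * Simplified stand-ins, not the real UVM definitions.
 */
#include <stdbool.h>
#include <stdio.h>

#define PG_BUSY     0x01	/* page currently owned by an I/O path */
#define PG_RELEASED 0x02	/* owner should free the page when done */

struct page {
	int          flags;
	struct page *next;	/* object's page list */
};

struct object {
	struct page *pages;
	int          npages;
	bool         killme;	/* destroy object once the last page is gone */
};

/* unlink one page from the object's list and account for it */
static void
page_free(struct object *obj, struct page **prevp)
{
	struct page *pg = *prevp;

	*prevp = pg->next;
	obj->npages--;
}

/*
 * detach: rather than waiting for PG_BUSY pages (the code being backed
 * out), mark them PG_RELEASED and let the I/O owner dispose of them.
 */
static void
object_detach(struct object *obj)
{
	struct page **prevp = &obj->pages;
	struct page *pg;
	bool busybody = false;

	while ((pg = *prevp) != NULL) {
		if (pg->flags & PG_BUSY) {
			pg->flags |= PG_RELEASED;	/* owner frees it later */
			busybody = true;
			prevp = &pg->next;
			continue;
		}
		page_free(obj, prevp);
	}

	if (busybody) {
		/* pages still in flight: defer destruction to releasepg */
		obj->killme = true;
		return;
	}
	printf("object destroyed immediately\n");
}

/*
 * releasepg: called by the I/O owner when it un-busies a PG_RELEASED page;
 * returns false if dropping the page also destroyed the object.
 */
static bool
object_releasepg(struct object *obj, struct page **prevp)
{
	page_free(obj, prevp);
	if (obj->killme && obj->npages == 0) {
		printf("last released page gone, object destroyed\n");
		return false;
	}
	return true;
}

int
main(void)
{
	struct page p1 = { PG_BUSY, NULL };
	struct object obj = { &p1, 1, false };

	object_detach(&obj);			/* p1 busy: marked PG_RELEASED */
	object_releasepg(&obj, &obj.pages);	/* I/O done: object dies */
	return 0;
}
```

The real uao_detach_locked()/uao_releasepg() additionally juggle the object lock and the page queues; the sketch only shows the ownership hand-off and the deferred teardown.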
diff --git a/sys/uvm/uvm_aobj.c b/sys/uvm/uvm_aobj.c index b2a68d6d249..11c0cc181bf 100644 --- a/sys/uvm/uvm_aobj.c +++ b/sys/uvm/uvm_aobj.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_aobj.c,v 1.43 2009/06/16 00:11:29 oga Exp $ */ +/* $OpenBSD: uvm_aobj.c,v 1.44 2009/06/16 17:14:14 oga Exp $ */ /* $NetBSD: uvm_aobj.c,v 1.39 2001/02/18 21:19:08 chs Exp $ */ /* @@ -139,7 +139,7 @@ struct pool uao_swhash_elt_pool; * uvm_aobj: the actual anon-backed uvm_object * * => the uvm_object is at the top of the structure, this allows - * (struct uvm_aobj *) == (struct uvm_object *) + * (struct uvm_device *) == (struct uvm_object *) * => only one of u_swslots and u_swhash is used in any given aobj */ @@ -562,7 +562,7 @@ uao_init(void) simple_lock_init(&uao_list_lock); /* - * NOTE: Pages for this pool must not come from a pageable + * NOTE: Pages fror this pool must not come from a pageable * kernel map! */ pool_init(&uao_swhash_elt_pool, sizeof(struct uao_swhash_elt), @@ -638,7 +638,8 @@ void uao_detach_locked(struct uvm_object *uobj) { struct uvm_aobj *aobj = (struct uvm_aobj *)uobj; - struct vm_page *pg; + struct vm_page *pg, *next; + boolean_t busybody; UVMHIST_FUNC("uao_detach"); UVMHIST_CALLED(maphist); /* @@ -665,26 +666,35 @@ uao_detach_locked(struct uvm_object *uobj) simple_unlock(&uao_list_lock); /* - * Free all pages left in the object. If they're busy, wait - * for them to become available before we kill it. - * Release swap resources then free the page. + * free all the pages that aren't PG_BUSY, + * mark for release any that are. */ - uvm_lock_pageq(); - while((pg = TAILQ_FIRST(&uobj->memq)) != NULL) { + busybody = FALSE; + for (pg = TAILQ_FIRST(&uobj->memq); pg != NULL; pg = next) { + next = TAILQ_NEXT(pg, listq); if (pg->pg_flags & PG_BUSY) { - atomic_setbits_int(&pg->pg_flags, PG_WANTED); - uvm_unlock_pageq(); - UVM_UNLOCK_AND_WAIT(pg, &uobj->vmobjlock, 0, - "uao_det", 0); - simple_lock(&uobj->vmobjlock); - uvm_lock_pageq(); + atomic_setbits_int(&pg->pg_flags, PG_RELEASED); + busybody = TRUE; continue; } + + /* zap the mappings, free the swap slot, free the page */ pmap_page_protect(pg, VM_PROT_NONE); uao_dropswap(&aobj->u_obj, pg->offset >> PAGE_SHIFT); + uvm_lock_pageq(); uvm_pagefree(pg); + uvm_unlock_pageq(); + } + + /* + * if we found any busy pages, we're done for now. + * mark the aobj for death, releasepg will finish up for us. + */ + if (busybody) { + aobj->u_flags |= UAO_FLAG_KILLME; + simple_unlock(&aobj->u_obj.vmobjlock); + return; } - uvm_unlock_pageq(); /* * finally, free the rest. @@ -716,6 +726,35 @@ uao_detach_locked(struct uvm_object *uobj) * => we return TRUE unless we encountered some sort of I/O error * XXXJRT currently never happens, as we never directly initiate * XXXJRT I/O + * + * comment on "cleaning" object and PG_BUSY pages: + * this routine is holding the lock on the object. the only time + * that is can run into a PG_BUSY page that it does not own is if + * some other process has started I/O on the page (e.g. either + * a pagein or a pageout). if the PG_BUSY page is being paged + * in, then it can not be dirty (!PG_CLEAN) because no one has + * had a change to modify it yet. if the PG_BUSY page is being + * paged out then it means that someone else has already started + * cleaning the page for us (how nice!). in this case, if we + * have syncio specified, then after we make our pass through the + * object we need to wait for the other PG_BUSY pages to clear + * off (i.e. we need to do an iosync). 
also note that once a + * page is PG_BUSY is must stary in its object until it is un-busyed. + * XXXJRT We never actually do this, as we are "flushing" anonymous + * XXXJRT memory, which doesn't have persistent backing store. + * + * note on page traversal: + * we can traverse the pages in an object either by going down the + * linked list in "uobj->memq", or we can go over the address range + * by page doing hash table lookups for each address. depending + * on how many pages are in the object it may be cheaper to do one + * or the other. we set "by_list" to true if we are using memq. + * if the cost of a hash lookup was equal to the cost of the list + * traversal we could compare the number of pages in the start->stop + * range to the total number of pages in the object. however, it + * seems that a hash table lookup is more expensive than the linked + * list traversal, so we multiply the number of pages in the + * start->stop range by a penalty which we define below. */ #define UAO_HASH_PENALTY 4 /* XXX: a guess */ @@ -724,13 +763,19 @@ boolean_t uao_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags) { struct uvm_aobj *aobj = (struct uvm_aobj *) uobj; - struct vm_page *pp; + struct vm_page *pp, *ppnext; + boolean_t retval, by_list; voff_t curoff; UVMHIST_FUNC("uao_flush"); UVMHIST_CALLED(maphist); + curoff = 0; /* XXX: shut up gcc */ + + retval = TRUE; /* default to success */ + if (flags & PGO_ALLPAGES) { start = 0; stop = aobj->u_pages << PAGE_SHIFT; + by_list = TRUE; /* always go by the list */ } else { start = trunc_page(start); stop = round_page(stop); @@ -739,10 +784,13 @@ uao_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags) "flush (fixed)\n"); stop = aobj->u_pages << PAGE_SHIFT; } + by_list = (uobj->uo_npages <= + ((stop - start) >> PAGE_SHIFT) * UAO_HASH_PENALTY); } - UVMHIST_LOG(maphist, " flush start=0x%lx, stop=0x%lx, flags=0x%lx", - (u_long)start, (u_long)stop, flags, 0); + UVMHIST_LOG(maphist, + " flush start=0x%lx, stop=0x%lx, by_list=%ld, flags=0x%lx", + (u_long)start, (u_long)stop, by_list, flags); /* * Don't need to do any work here if we're not freeing @@ -751,31 +799,44 @@ uao_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags) if ((flags & (PGO_DEACTIVATE|PGO_FREE)) == 0) { UVMHIST_LOG(maphist, "<- done (no work to do)",0,0,0,0); - return (TRUE); + return (retval); } - /* locked: uobj */ - curoff = start; - for (;;) { - if (curoff < stop) { - pp = uvm_pagelookup(uobj, curoff); - curoff += PAGE_SIZE; - if (pp == NULL) + /* + * now do it. note: we must update ppnext in the body of loop or we + * will get stuck. we need to use ppnext because we may free "pp" + * before doing the next loop. + */ + + if (by_list) { + pp = TAILQ_FIRST(&uobj->memq); + } else { + curoff = start; + pp = uvm_pagelookup(uobj, curoff); + } + + ppnext = NULL; /* XXX: shut up gcc */ + uvm_lock_pageq(); /* page queues locked */ + + /* locked: both page queues and uobj */ + for ( ; (by_list && pp != NULL) || + (!by_list && curoff < stop) ; pp = ppnext) { + if (by_list) { + ppnext = TAILQ_NEXT(pp, listq); + + /* range check */ + if (pp->offset < start || pp->offset >= stop) continue; } else { - break; - } + curoff += PAGE_SIZE; + if (curoff < stop) + ppnext = uvm_pagelookup(uobj, curoff); - /* Make sure page is unbusy, else wait for it. 
*/ - if (pp->pg_flags & PG_BUSY) { - atomic_setbits_int(&pp->pg_flags, PG_WANTED); - UVM_UNLOCK_AND_WAIT(pp, &uobj->vmobjlock, 0, - "uaoflsh", 0); - simple_lock(&uobj->vmobjlock); - curoff -= PAGE_SIZE; - continue; + /* null check */ + if (pp == NULL) + continue; } - + switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) { /* * XXX In these first 3 cases, we always just @@ -784,9 +845,7 @@ uao_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags) * XXX in the future. */ case PGO_CLEANIT|PGO_FREE: - /* FALLTHROUGH */ case PGO_CLEANIT|PGO_DEACTIVATE: - /* FALLTHROUGH */ case PGO_DEACTIVATE: deactivate_it: /* skip the page if it's loaned or wired */ @@ -794,13 +853,16 @@ uao_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags) pp->wire_count != 0) continue; - uvm_lock_pageq(); +#ifdef UBC + /* ...and deactivate the page. */ + pmap_clear_reference(pp); +#else /* zap all mappings for the page. */ pmap_page_protect(pp, VM_PROT_NONE); /* ...and deactivate the page. */ +#endif uvm_pagedeactivate(pp); - uvm_unlock_pageq(); continue; @@ -817,13 +879,19 @@ uao_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags) pp->wire_count != 0) continue; + /* + * mark the page as released if its busy. + */ + if (pp->pg_flags & PG_BUSY) { + atomic_setbits_int(&pp->pg_flags, PG_RELEASED); + continue; + } + /* zap all mappings for the page. */ pmap_page_protect(pp, VM_PROT_NONE); uao_dropswap(uobj, pp->offset >> PAGE_SHIFT); - uvm_lock_pageq(); uvm_pagefree(pp); - uvm_unlock_pageq(); continue; @@ -832,9 +900,11 @@ uao_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags) } } + uvm_unlock_pageq(); + UVMHIST_LOG(maphist, "<- done, rv=%ld",retval,0,0,0); - return (TRUE); + return (retval); } /* @@ -916,10 +986,10 @@ uao_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps, } /* - * to be useful must get a non-busy page + * to be useful must get a non-busy, non-released page */ if (ptmp == NULL || - (ptmp->pg_flags & PG_BUSY) != 0) { + (ptmp->pg_flags & (PG_BUSY|PG_RELEASED)) != 0) { if (lcv == centeridx || (flags & PGO_ALLPAGES) != 0) /* need to do a wait or I/O! */ @@ -1026,7 +1096,7 @@ uao_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps, } /* page is there, see if we need to wait on it */ - if ((ptmp->pg_flags & PG_BUSY) != 0) { + if ((ptmp->pg_flags & (PG_BUSY|PG_RELEASED)) != 0) { atomic_setbits_int(&ptmp->pg_flags, PG_WANTED); UVMHIST_LOG(pdhist, "sleeping, ptmp->flags 0x%lx\n", @@ -1065,7 +1135,8 @@ uao_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps, /* * just zero the page if there's nothing in swap. */ - if (swslot == 0) { + if (swslot == 0) + { /* * page hasn't existed before, just zero it. */ @@ -1176,7 +1247,27 @@ uao_releasepg(struct vm_page *pg, struct vm_page **nextpgp /* OUT */) if (!nextpgp) uvm_unlock_pageq(); /* keep locked for daemon */ - return TRUE; + /* + * if we're not killing the object, we're done. + */ + if ((aobj->u_flags & UAO_FLAG_KILLME) == 0) + return TRUE; + KASSERT(aobj->u_obj.uo_refs == 0); + + /* + * if there are still pages in the object, we're done for now. + */ + if (aobj->u_obj.uo_npages != 0) + return TRUE; + + KASSERT(TAILQ_EMPTY(&aobj->u_obj.memq)); + + /* + * finally, free the rest. + */ + uao_free(aobj); + + return FALSE; } @@ -1382,6 +1473,7 @@ uao_pagein_page(struct uvm_aobj *aobj, int pageidx) return FALSE; } + KASSERT((pg->pg_flags & PG_RELEASED) == 0); /* * ok, we've got the page now. 
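
The restored uao_flush() above also brings back the by_list heuristic described in its block comment: walk uobj->memq when the object holds few pages, otherwise look pages up by offset, treating a hash lookup as roughly UAO_HASH_PENALTY times the cost of a list step. Below is a minimal sketch of that decision only, with illustrative constants and names rather than the kernel's.

```c
/* Sketch of the list-vs-lookup flush heuristic; values are illustrative. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT   12
#define HASH_PENALTY 4	/* assumed relative cost of one offset lookup */

/*
 * Decide whether to flush by walking the object's page list or by doing
 * one offset lookup per page in [start, stop).
 */
static bool
flush_by_list(unsigned int obj_npages, uint64_t start, uint64_t stop,
    bool allpages)
{
	uint64_t range_pages;

	if (allpages)
		return true;	/* whole object: always walk the list */

	range_pages = (stop - start) >> PAGE_SHIFT;
	return obj_npages <= range_pages * HASH_PENALTY;
}

int
main(void)
{
	/* small range over a big object: per-offset lookups win */
	printf("%d\n", flush_by_list(10000, 0, 16 << PAGE_SHIFT, false));
	/* small object: walking the list wins */
	printf("%d\n", flush_by_list(8, 0, 16 << PAGE_SHIFT, false));
	return 0;
}
```

The kernel's constant is UAO_HASH_PENALTY, commented as a guess, so the cutoff is deliberately rough; the point is only to avoid per-page lookups on densely populated objects.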
diff --git a/sys/uvm/uvm_aobj.h b/sys/uvm/uvm_aobj.h index b30e9026b07..b97281011dd 100644 --- a/sys/uvm/uvm_aobj.h +++ b/sys/uvm/uvm_aobj.h @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_aobj.h,v 1.9 2009/05/05 05:12:17 oga Exp $ */ +/* $OpenBSD: uvm_aobj.h,v 1.10 2009/06/16 17:14:15 oga Exp $ */ /* $NetBSD: uvm_aobj.h,v 1.10 2000/01/11 06:57:49 chs Exp $ */ /* @@ -55,6 +55,8 @@ #define UAO_FLAG_KERNSWAP 0x2 /* enable kernel swap */ /* internal flags */ +#define UAO_FLAG_KILLME 0x4 /* aobj should die when last released + * page is no longer PG_BUSY ... */ #define UAO_FLAG_NOSWAP 0x8 /* aobj can't swap (kernel obj only!) */ #ifdef _KERNEL diff --git a/sys/uvm/uvm_km.c b/sys/uvm/uvm_km.c index 895a9593173..962b41bfac1 100644 --- a/sys/uvm/uvm_km.c +++ b/sys/uvm/uvm_km.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_km.c,v 1.71 2009/05/05 05:27:53 oga Exp $ */ +/* $OpenBSD: uvm_km.c,v 1.72 2009/06/16 17:14:15 oga Exp $ */ /* $NetBSD: uvm_km.c,v 1.42 2001/01/14 02:10:01 thorpej Exp $ */ /* @@ -276,12 +276,8 @@ uvm_km_pgremove(struct uvm_object *uobj, vaddr_t start, vaddr_t end) pp->pg_flags & PG_BUSY, 0, 0); if (pp->pg_flags & PG_BUSY) { - atomic_setbits_int(&pp->pg_flags, PG_WANTED); - UVM_UNLOCK_AND_WAIT(pp, &uobj->vmobjlock, 0, - "km_pgrm", 0); - simple_lock(&uobj->vmobjlock); - curoff -= PAGE_SIZE; /* loop back to us */ - continue; + /* owner must check for this when done */ + atomic_setbits_int(&pp->pg_flags, PG_RELEASED); } else { /* free the swap slot... */ uao_dropswap(uobj, curoff >> PAGE_SHIFT); @@ -515,6 +511,21 @@ uvm_km_alloc1(struct vm_map *map, vsize_t size, vsize_t align, boolean_t zeroit) loopva = kva; while (size) { simple_lock(&uvm.kernel_object->vmobjlock); + pg = uvm_pagelookup(uvm.kernel_object, offset); + + /* + * if we found a page in an unallocated region, it must be + * released + */ + if (pg) { + if ((pg->pg_flags & PG_RELEASED) == 0) + panic("uvm_km_alloc1: non-released page"); + atomic_setbits_int(&pg->pg_flags, PG_WANTED); + UVM_UNLOCK_AND_WAIT(pg, &uvm.kernel_object->vmobjlock, + FALSE, "km_alloc", 0); + continue; /* retry */ + } + /* allocate ram */ pg = uvm_pagealloc(uvm.kernel_object, offset, NULL, 0); if (pg) { diff --git a/sys/uvm/uvm_pager.c b/sys/uvm/uvm_pager.c index 292cc90e218..4aad8ee3738 100644 --- a/sys/uvm/uvm_pager.c +++ b/sys/uvm/uvm_pager.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_pager.c,v 1.51 2009/05/23 14:06:37 oga Exp $ */ +/* $OpenBSD: uvm_pager.c,v 1.52 2009/06/16 17:14:15 oga Exp $ */ /* $NetBSD: uvm_pager.c,v 1.36 2000/11/27 18:26:41 chs Exp $ */ /* @@ -339,8 +339,6 @@ uvm_pagermapout(vaddr_t kva, int npages) * PGO_ALLPAGES: all pages in object are valid targets * !PGO_ALLPAGES: use "lo" and "hi" to limit range of cluster * PGO_DOACTCLUST: include active pages in cluster. - * PGO_FREE: set the PG_RELEASED bits on the cluster so they'll be freed - * in async io (caller must clean on error). * NOTE: the caller should clear PG_CLEANCHK bits if PGO_DOACTCLUST. * PG_CLEANCHK is only a hint, but clearing will help reduce * the number of calls we make to the pmap layer. @@ -442,14 +440,6 @@ uvm_mk_pcluster(struct uvm_object *uobj, struct vm_page **pps, int *npages, atomic_setbits_int(&pclust->pg_flags, PG_BUSY); UVM_PAGE_OWN(pclust, "uvm_mk_pcluster"); - /* - * If we want to free after io is done, and we're - * async, set the released flag - */ - if ((flags & (PGO_FREE|PGO_SYNCIO)) == PGO_FREE) - atomic_setbits_int(&pclust->pg_flags, - PG_RELEASED); - /* XXX: protect wired page? see above comment. 
*/ pmap_page_protect(pclust, VM_PROT_READ); if (!forward) { @@ -491,7 +481,6 @@ uvm_mk_pcluster(struct uvm_object *uobj, struct vm_page **pps, int *npages, * PGO_DOACTCLUST: include "PQ_ACTIVE" pages as valid targets * PGO_SYNCIO: do SYNC I/O (no async) * PGO_PDFREECLUST: pagedaemon: drop cluster on successful I/O - * PGO_FREE: tell the aio daemon to free pages in the async case. * => start/stop: if (uobj && !PGO_ALLPAGES) limit targets to this range * if (!uobj) start is the (daddr64_t) of the starting swapblk * => return state: @@ -715,6 +704,8 @@ uvm_pager_dropcluster(struct uvm_object *uobj, struct vm_page *pg, struct vm_page **ppsp, int *npages, int flags) { int lcv; + boolean_t obj_is_alive; + struct uvm_object *saved_uobj; /* * drop all pages but "pg" @@ -756,8 +747,9 @@ uvm_pager_dropcluster(struct uvm_object *uobj, struct vm_page *pg, } /* if page was released, release it. otherwise un-busy it */ - if (ppsp[lcv]->pg_flags & PG_RELEASED && - ppsp[lcv]->pg_flags & PQ_ANON) { + if (ppsp[lcv]->pg_flags & PG_RELEASED) { + + if (ppsp[lcv]->pg_flags & PQ_ANON) { /* so that anfree will free */ atomic_clearbits_int(&ppsp[lcv]->pg_flags, PG_BUSY); @@ -769,13 +761,34 @@ uvm_pager_dropcluster(struct uvm_object *uobj, struct vm_page *pg, uvm_anfree(ppsp[lcv]->uanon); continue; - } else { + } + /* - * if we were planning on async io then we would - * have PG_RELEASED set, clear that with the others. + * pgo_releasepg will dump the page for us */ + + saved_uobj = ppsp[lcv]->uobject; + obj_is_alive = + saved_uobj->pgops->pgo_releasepg(ppsp[lcv], NULL); + + /* for normal objects, "pg" is still PG_BUSY by us, + * so obj can't die */ + KASSERT(!uobj || obj_is_alive); + + /* only unlock the object if it is still alive... */ + if (obj_is_alive && saved_uobj != uobj) + simple_unlock(&saved_uobj->vmobjlock); + + /* + * XXXCDC: suppose uobj died in the pgo_releasepg? + * how pass that + * info up to caller. we are currently ignoring it... + */ + + continue; /* next page */ + } else { atomic_clearbits_int(&ppsp[lcv]->pg_flags, - PG_BUSY|PG_WANTED|PG_FAKE|PG_RELEASED); + PG_BUSY|PG_WANTED|PG_FAKE); UVM_PAGE_OWN(ppsp[lcv], NULL); } @@ -799,6 +812,33 @@ uvm_pager_dropcluster(struct uvm_object *uobj, struct vm_page *pg, } } +#ifdef UBC +/* + * interrupt-context iodone handler for nested i/o bufs. + * + * => must be at splbio(). + */ + +void +uvm_aio_biodone1(struct buf *bp) +{ + struct buf *mbp = bp->b_private; + + splassert(IPL_BIO); + + KASSERT(mbp != bp); + if (bp->b_flags & B_ERROR) { + mbp->b_flags |= B_ERROR; + mbp->b_error = bp->b_error; + } + mbp->b_resid -= bp->b_bcount; + pool_put(&bufpool, bp); + if (mbp->b_resid == 0) { + biodone(mbp); + } +} +#endif + /* * interrupt-context iodone handler for single-buf i/os * or the top-level buf of a nested-buf i/o. diff --git a/sys/uvm/uvm_vnode.c b/sys/uvm/uvm_vnode.c index 998f0fa0a62..439cfe03712 100644 --- a/sys/uvm/uvm_vnode.c +++ b/sys/uvm/uvm_vnode.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_vnode.c,v 1.64 2009/06/16 16:42:41 ariane Exp $ */ +/* $OpenBSD: uvm_vnode.c,v 1.65 2009/06/16 17:14:15 oga Exp $ */ /* $NetBSD: uvm_vnode.c,v 1.36 2000/11/24 20:34:01 chs Exp $ */ /* @@ -415,23 +415,30 @@ uvn_detach(struct uvm_object *uobj) /* * given the structure of this pager, the above flush request will * create the following state: all the pages that were in the object - * have either been free'd or they are marked PG_BUSY and in the - * middle of an async io. 
If we still have pages we set the "relkill" - * state, so that in the case the vnode gets terminated we know - * to leave it alone. Otherwise we'll kill the vnode when it's empty. + * have either been free'd or they are marked PG_BUSY|PG_RELEASED. + * the PG_BUSY bit was set either by us or the daemon for async I/O. + * in either case, if we have pages left we can't kill the object + * yet because i/o is pending. in this case we set the "relkill" + * flag which will cause pgo_releasepg to kill the object once all + * the I/O's are done [pgo_releasepg will be called from the aiodone + * routine or from the page daemon]. */ - uvn->u_flags |= UVM_VNODE_RELKILL; - /* wait on any outstanding io */ - while (uobj->uo_npages && uvn->u_flags & UVM_VNODE_RELKILL) { - uvn->u_flags |= UVM_VNODE_IOSYNC; - UVM_UNLOCK_AND_WAIT(&uvn->u_nio, &uvn->u_obj.vmobjlock, FALSE, - "uvn_term",0); - simple_lock(&uvn->u_obj.vmobjlock); - } - - if ((uvn->u_flags & UVM_VNODE_RELKILL) == 0) + if (uobj->uo_npages) { /* I/O pending. iodone will free */ +#ifdef DEBUG + /* + * XXXCDC: very unlikely to happen until we have async i/o + * so print a little info message in case it does. + */ + printf("uvn_detach: vn %p has pages left after flush - " + "relkill mode\n", uobj); +#endif + uvn->u_flags |= UVM_VNODE_RELKILL; + simple_unlock(&uobj->vmobjlock); + UVMHIST_LOG(maphist,"<- done! (releasepg will kill obj)", 0, 0, + 0, 0); return; + } /* * kill object now. note that we can't be on the sync q because @@ -483,6 +490,8 @@ uvn_detach(struct uvm_object *uobj) * => the caller must XLOCK and VOP_LOCK the vnode before calling us * [protects us from getting a vnode that is already in the DYING * state...] + * => unlike uvn_detach, this function must not return until all the + * uvn's pages are disposed of. * => in case [2] the uvn is still alive after this call, but all I/O * ops will fail (due to the backing vnode now being "dead"). this * will prob. kill any process using the uvn due to pgo_get failing. @@ -640,7 +649,12 @@ uvm_vnp_terminate(struct vnode *vp) boolean_t uvn_releasepg(struct vm_page *pg, struct vm_page **nextpgp /* OUT */) { - KASSERT(pg->pg_flags & PG_RELEASED); + struct uvm_vnode *uvn = (struct uvm_vnode *) pg->uobject; + struct vnode *vp = (struct vnode *)uvn; +#ifdef DIAGNOSTIC + if ((pg->pg_flags & PG_RELEASED) == 0) + panic("uvn_releasepg: page not released!"); +#endif /* * dispose of the page [caller handles PG_WANTED] @@ -652,6 +666,32 @@ uvn_releasepg(struct vm_page *pg, struct vm_page **nextpgp /* OUT */) uvm_pagefree(pg); if (!nextpgp) uvm_unlock_pageq(); + + /* + * now see if we need to kill the object + */ + if (uvn->u_flags & UVM_VNODE_RELKILL) { + if (uvn->u_obj.uo_refs) + panic("uvn_releasepg: kill flag set on referenced " + "object!"); + if (uvn->u_obj.uo_npages == 0) { + if (uvn->u_flags & UVM_VNODE_WRITEABLE) { + LIST_REMOVE(uvn, u_wlist); + } +#ifdef DIAGNOSTIC + if (!TAILQ_EMPTY(&uvn->u_obj.memq)) + panic("uvn_releasepg: pages in object with npages == 0"); +#endif + if (uvn->u_flags & UVM_VNODE_WANTED) + /* still holding object lock */ + wakeup(uvn); + + uvn->u_flags = 0; /* DEAD! 
*/ + simple_unlock(&uvn->u_obj.vmobjlock); + vrele(vp); + return (FALSE); + } + } return (TRUE); } @@ -752,13 +792,15 @@ boolean_t uvn_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags) { struct uvm_vnode *uvn = (struct uvm_vnode *) uobj; - struct vm_page *pp, *ptmp; + struct vm_page *pp, *ppnext, *ptmp; struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp; int npages, result, lcv; - boolean_t retval, need_iosync, needs_clean; + boolean_t retval, need_iosync, by_list, needs_clean, all; voff_t curoff; + u_short pp_version; UVMHIST_FUNC("uvn_flush"); UVMHIST_CALLED(maphist); + curoff = 0; /* XXX: shut up gcc */ /* * get init vals and determine how we are going to traverse object */ @@ -766,16 +808,24 @@ uvn_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags) need_iosync = FALSE; retval = TRUE; /* return value */ if (flags & PGO_ALLPAGES) { - start = 0; - stop = round_page(uvn->u_size); + all = TRUE; + by_list = TRUE; /* always go by the list */ } else { start = trunc_page(start); - stop = MIN(round_page(stop), round_page(uvn->u_size)); + stop = round_page(stop); +#ifdef DEBUG + if (stop > round_page(uvn->u_size)) + printf("uvn_flush: strange, got an out of range " + "flush (fixed)\n"); +#endif + all = FALSE; + by_list = (uobj->uo_npages <= + ((stop - start) >> PAGE_SHIFT) * UVN_HASH_PENALTY); } UVMHIST_LOG(maphist, - " flush start=0x%lx, stop=0x%lx, flags=0x%lx", - (u_long)start, (u_long)stop, flags, 0); + " flush start=0x%lx, stop=0x%lx, by_list=%ld, flags=0x%lx", + (u_long)start, (u_long)stop, by_list, flags); /* * PG_CLEANCHK: this bit is used by the pgo_mk_pcluster function as @@ -788,21 +838,75 @@ uvn_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags) * [borrowed PG_CLEANCHK idea from FreeBSD VM] */ - if ((flags & PGO_CLEANIT) != 0) { - KASSERT(uobj->pgops->pgo_mk_pcluster != 0); - for (curoff = start ; curoff < stop; curoff += PAGE_SIZE) { - if ((pp = uvm_pagelookup(uobj, curoff)) != NULL) + if ((flags & PGO_CLEANIT) != 0 && + uobj->pgops->pgo_mk_pcluster != NULL) { + if (by_list) { + TAILQ_FOREACH(pp, &uobj->memq, listq) { + if (!all && + (pp->offset < start || pp->offset >= stop)) + continue; atomic_clearbits_int(&pp->pg_flags, PG_CLEANCHK); + } + + } else { /* by hash */ + for (curoff = start ; curoff < stop; + curoff += PAGE_SIZE) { + pp = uvm_pagelookup(uobj, curoff); + if (pp) + atomic_clearbits_int(&pp->pg_flags, + PG_CLEANCHK); + } } } + /* + * now do it. note: we must update ppnext in body of loop or we + * will get stuck. we need to use ppnext because we may free "pp" + * before doing the next loop. 
+ */ + + if (by_list) { + pp = TAILQ_FIRST(&uobj->memq); + } else { + curoff = start; + pp = uvm_pagelookup(uobj, curoff); + } + + ppnext = NULL; /* XXX: shut up gcc */ ppsp = NULL; /* XXX: shut up gcc */ uvm_lock_pageq(); /* page queues locked */ + /* locked: both page queues and uobj */ - for (curoff = start; curoff < stop; curoff += PAGE_SIZE) { - if ((pp = uvm_pagelookup(uobj, curoff)) == NULL) - continue; + for ( ; (by_list && pp != NULL) || + (!by_list && curoff < stop) ; pp = ppnext) { + + if (by_list) { + + /* + * range check + */ + + if (!all && + (pp->offset < start || pp->offset >= stop)) { + ppnext = TAILQ_NEXT(pp, listq); + continue; + } + + } else { + + /* + * null check + */ + + curoff += PAGE_SIZE; + if (pp == NULL) { + if (curoff < stop) + ppnext = uvm_pagelookup(uobj, curoff); + continue; + } + + } /* * handle case where we do not need to clean page (either @@ -839,32 +943,37 @@ uvn_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags) } /* - * if we don't need a clean... deactivate/free pages then cont. + * if we don't need a clean... load ppnext and dispose of pp */ if (!needs_clean) { + /* load ppnext */ + if (by_list) + ppnext = TAILQ_NEXT(pp, listq); + else { + if (curoff < stop) + ppnext = uvm_pagelookup(uobj, curoff); + } + + /* now dispose of pp */ if (flags & PGO_DEACTIVATE) { if ((pp->pg_flags & PQ_INACTIVE) == 0 && pp->wire_count == 0) { pmap_page_protect(pp, VM_PROT_NONE); uvm_pagedeactivate(pp); } + } else if (flags & PGO_FREE) { if (pp->pg_flags & PG_BUSY) { + /* release busy pages */ atomic_setbits_int(&pp->pg_flags, - PG_WANTED); - uvm_unlock_pageq(); - UVM_UNLOCK_AND_WAIT(pp, - &uobj->vmobjlock, 0, "uvn_flsh", 0); - simple_lock(&uobj->vmobjlock); - uvm_lock_pageq(); - curoff -= PAGE_SIZE; - continue; + PG_RELEASED); } else { pmap_page_protect(pp, VM_PROT_NONE); /* removed page from object */ uvm_pagefree(pp); } } + /* ppnext is valid so we can continue... */ continue; } @@ -880,9 +989,7 @@ uvn_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags) atomic_setbits_int(&pp->pg_flags, PG_BUSY); UVM_PAGE_OWN(pp, "uvn_flush"); pmap_page_protect(pp, VM_PROT_READ); - /* if we're async, free the page in aiodoned */ - if ((flags & (PGO_FREE|PGO_SYNCIO)) == PGO_FREE) - atomic_setbits_int(&pp->pg_flags, PG_RELEASED); + pp_version = pp->pg_version; ReTry: ppsp = pps; npages = sizeof(pps) / sizeof(struct vm_page *); @@ -893,11 +1000,11 @@ ReTry: /* unlocked: page queues, uobj */ /* - * if we did an async I/O it is remotely possible for the - * async i/o to complete and the page "pp" be freed or what - * not before we get a chance to relock the object. Therefore, - * we only touch it when it won't be freed, RELEASED took care - * of the rest. + * at this point nothing is locked. if we did an async I/O + * it is remotely possible for the async i/o to complete and + * the page "pp" be freed or what not before we get a chance + * to relock the object. in order to detect this, we have + * saved the version number of the page in "pp_version". */ /* relock! */ @@ -906,7 +1013,7 @@ ReTry: /* * VM_PAGER_AGAIN: given the structure of this pager, this - * can only happen when we are doing async I/O and can't + * can only happen when we are doing async I/O and can't * map the pages into kernel memory (pager_map) due to lack * of vm space. if this happens we drop back to sync I/O. 
*/ @@ -924,10 +1031,6 @@ ReTry: panic("uvn_flush: PGO_SYNCIO return 'try again' error (impossible)"); #endif flags |= PGO_SYNCIO; - if (flags & PGO_FREE) - atomic_clearbits_int(&pp->pg_flags, - PG_RELEASED); - goto ReTry; } @@ -939,20 +1042,66 @@ ReTry: */ /* - * for pending async i/o if we are not deactivating - * we can move on to the next page. aiodoned deals with - * the freeing case for us. + * for pending async i/o if we are not deactivating/freeing + * we can move on to the next page. */ - if (result == VM_PAGER_PEND && (flags & PGO_DEACTIVATE) == 0) - continue; + + if (result == VM_PAGER_PEND) { + + if ((flags & (PGO_DEACTIVATE|PGO_FREE)) == 0) { + /* + * no per-page ops: refresh ppnext and continue + */ + if (by_list) { + if (pp->pg_version == pp_version) + ppnext = TAILQ_NEXT(pp, listq); + else + /* reset */ + ppnext = TAILQ_FIRST(&uobj->memq); + } else { + if (curoff < stop) + ppnext = uvm_pagelookup(uobj, + curoff); + } + continue; + } + + /* need to do anything here? */ + } /* - * need to look at each page of the I/O operation, and do what - * we gotta do. + * need to look at each page of the I/O operation. we defer + * processing "pp" until the last trip through this "for" loop + * so that we can load "ppnext" for the main loop after we + * play with the cluster pages [thus the "npages + 1" in the + * loop below]. */ - for (lcv = 0 ; lcv < npages; lcv++) { - ptmp = ppsp[lcv]; + for (lcv = 0 ; lcv < npages + 1 ; lcv++) { + + /* + * handle ppnext for outside loop, and saving pp + * until the end. + */ + if (lcv < npages) { + if (ppsp[lcv] == pp) + continue; /* skip pp until the end */ + ptmp = ppsp[lcv]; + } else { + ptmp = pp; + + /* set up next page for outer loop */ + if (by_list) { + if (pp->pg_version == pp_version) + ppnext = TAILQ_NEXT(pp, listq); + else + /* reset */ + ppnext = TAILQ_FIRST(&uobj->memq); + } else { + if (curoff < stop) + ppnext = uvm_pagelookup(uobj, curoff); + } + } /* * verify the page didn't get moved while obj was @@ -976,10 +1125,25 @@ ReTry: atomic_clearbits_int(&ptmp->pg_flags, PG_WANTED|PG_BUSY); UVM_PAGE_OWN(ptmp, NULL); - atomic_setbits_int(&ptmp->pg_flags, - PG_CLEAN|PG_CLEANCHK); - if ((flags & PGO_FREE) == 0) - pmap_clear_modify(ptmp); + if (ptmp->pg_flags & PG_RELEASED) { + + /* + * pgo_releasepg needs to grab the + * pageq lock itself. + */ + uvm_unlock_pageq(); + if (!uvn_releasepg(ptmp, NULL)) + return (TRUE); + + uvm_lock_pageq(); /* relock */ + continue; /* next page */ + + } else { + atomic_setbits_int(&ptmp->pg_flags, + PG_CLEAN|PG_CLEANCHK); + if ((flags & PGO_FREE) == 0) + pmap_clear_modify(ptmp); + } } /* @@ -992,21 +1156,29 @@ ReTry: pmap_page_protect(ptmp, VM_PROT_NONE); uvm_pagedeactivate(ptmp); } - } else if (flags & PGO_FREE && - result != VM_PAGER_PEND) { - if (result != VM_PAGER_OK) { - printf("uvn_flush: obj=%p, " - "offset=0x%llx. error " - "during pageout.\n", - pp->uobject, - (long long)pp->offset); - printf("uvn_flush: WARNING: " - "changes to page may be " - "lost!\n"); - retval = FALSE; + + } else if (flags & PGO_FREE) { + if (result == VM_PAGER_PEND) { + if ((ptmp->pg_flags & PG_BUSY) != 0) + /* signal for i/o done */ + atomic_setbits_int( + &ptmp->pg_flags, + PG_RELEASED); + } else { + if (result != VM_PAGER_OK) { + printf("uvn_flush: obj=%p, " + "offset=0x%llx. 
error " + "during pageout.\n", + pp->uobject, + (long long)pp->offset); + printf("uvn_flush: WARNING: " + "changes to page may be " + "lost!\n"); + retval = FALSE; + } + pmap_page_protect(ptmp, VM_PROT_NONE); + uvm_pagefree(ptmp); } - pmap_page_protect(ptmp, VM_PROT_NONE); - uvm_pagefree(ptmp); } } /* end of "lcv" for loop */ @@ -1149,7 +1321,7 @@ uvn_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps, /* to be useful must get a non-busy, non-released pg */ if (ptmp == NULL || - (ptmp->pg_flags & PG_BUSY) != 0) { + (ptmp->pg_flags & (PG_BUSY|PG_RELEASED)) != 0) { if (lcv == centeridx || (flags & PGO_ALLPAGES) != 0) done = FALSE; /* need to do a wait or I/O! */ @@ -1255,7 +1427,7 @@ uvn_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps, } /* page is there, see if we need to wait on it */ - if ((ptmp->pg_flags & PG_BUSY) != 0) { + if ((ptmp->pg_flags & (PG_BUSY|PG_RELEASED)) != 0) { atomic_setbits_int(&ptmp->pg_flags, PG_WANTED); UVM_UNLOCK_AND_WAIT(ptmp, &uobj->vmobjlock, FALSE, "uvn_get",0); |
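
A recurring pattern in the uvm_pager.c and uvm_vnode.c hunks above is the pgo_releasepg() contract: the callback frees a PG_RELEASED page and returns FALSE when doing so also destroyed the owning object, in which case the caller must not unlock or otherwise touch that object again. The sketch below shows only that calling convention, with simplified types and names that are not the UVM interfaces.

```c
/* Sketch of the releasepg calling convention; types are simplified. */
#include <stdbool.h>
#include <stdio.h>

struct object;		/* opaque pager object */
struct page;		/* opaque page */

/* frees a PG_RELEASED page; returns false if that destroyed the object */
typedef bool (*releasepg_fn)(struct object *, struct page *);

static void
object_unlock(struct object *obj)
{
	(void)obj;
	printf("object unlocked\n");
}

static void
drop_released_page(struct object *obj, struct page *pg, releasepg_fn releasepg)
{
	bool obj_is_alive = releasepg(obj, pg);

	/*
	 * Only touch the object's lock if releasepg left it alive; if the
	 * object died, its lock was torn down along with it.
	 */
	if (obj_is_alive)
		object_unlock(obj);
	else
		printf("object died in releasepg; caller must not touch it\n");
}

/* toy releasepg that reports the object died with its last page */
static bool
toy_releasepg(struct object *obj, struct page *pg)
{
	(void)obj;
	(void)pg;
	return false;
}

int
main(void)
{
	drop_released_page(NULL, NULL, toy_releasepg);
	return 0;
}
```

This mirrors the obj_is_alive / saved_uobj handling in uvm_pager_dropcluster() above, where the saved object is only unlocked when releasepg reports it survived.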