author | Ariane van der Steldt <ariane@cvs.openbsd.org> | 2009-06-01 17:42:34 +0000
committer | Ariane van der Steldt <ariane@cvs.openbsd.org> | 2009-06-01 17:42:34 +0000
commit | d30afc0ec38415711bc30130e9412a6026468e8b (patch)
tree | ee3b3b40a267f69f54dca2c401c95db8de083c91
parent | f5deafb272a62d5cf541c0d5ded06c603823ad2f (diff)
physmem allocator: change the view of free memory from single free pages
to free ranges.
Classify memory based on region with associated use-counter (which is used
to construct a priority list of where to allocate memory).
Based on code from tedu@, help from many.
Ok art@
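The idea described above — free memory tracked as ranges rather than as individual pages, each range carrying a use counter that orders a priority list consulted at allocation time — can be illustrated with a small standalone sketch. This is a model only, using invented names (`struct range`, `pick_range()`); it is not code from the patch.

```c
#include <stddef.h>

/* Illustrative model: a free physical memory range with a use counter. */
struct range {
	unsigned long	 start;		/* first page number of the range */
	unsigned long	 npages;	/* length of the range in pages */
	int		 use;		/* how many devices rely on this range */
	struct range	*next;		/* list kept sorted by ascending use */
};

/*
 * Walk the use-ordered list and take the first range large enough for
 * the request.  Ranges that devices depend on (e.g. ISA DMA or 32-bit
 * DMA memory) carry a higher use counter, sort to the back, and are
 * only consumed when nothing else can satisfy the allocation.
 */
struct range *
pick_range(struct range *head, unsigned long npages)
{
	struct range *r;

	for (r = head; r != NULL; r = r->next)
		if (r->npages >= npages)
			return (r);
	return (NULL);
}
```

In the patch itself the ranges are `struct uvm_pmemrange` objects kept in red-black trees keyed by address and by size, and the use-ordered list is `uvm.pmr_control.use`.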
-rw-r--r-- | sys/arch/amd64/amd64/pmap.c | 10
-rw-r--r-- | sys/arch/amd64/include/vmparam.h | 9
-rw-r--r-- | sys/arch/i386/i386/pmap.c | 11
-rw-r--r-- | sys/arch/i386/i386/pmapae.c | 15
-rw-r--r-- | sys/arch/i386/include/vmparam.h | 9
-rw-r--r-- | sys/conf/files | 3
-rw-r--r-- | sys/uvm/uvm.h | 31
-rw-r--r-- | sys/uvm/uvm_extern.h | 9
-rw-r--r-- | sys/uvm/uvm_map.c | 15
-rw-r--r-- | sys/uvm/uvm_page.c | 173
-rw-r--r-- | sys/uvm/uvm_page.h | 17
-rw-r--r-- | sys/uvm/uvm_pglist.c | 328
-rw-r--r-- | sys/uvm/uvm_pmemrange.c | 1248
-rw-r--r-- | sys/uvm/uvm_pmemrange.h | 83
-rw-r--r-- | sys/uvm/uvm_vnode.c | 4
-rw-r--r-- | sys/xfs/xfs_vnodeops-bsd.c | 2
16 files changed, 1503 insertions, 464 deletions
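For callers, the visible change in the diff below is that `uvm_pglistalloc()` keeps its signature but now converts its byte arguments to page numbers and forwards the request to `uvm_pmr_getpages()`, and a new flag, `UVM_PLA_TRY_CONTIG`, lets a driver state a preference for physically contiguous memory. A usage sketch follows; the driver context and buffer size are hypothetical, while the function, its argument order and the `UVM_PLA_*` flags are taken from the diff.

```c
#include <sys/param.h>
#include <sys/queue.h>
#include <uvm/uvm_extern.h>

/*
 * Hypothetical driver helper: allocate a zeroed 64KB buffer below 16MB
 * (ISA DMA reachable), preferably physically contiguous.  Exactly one
 * of UVM_PLA_WAITOK and UVM_PLA_NOWAIT must be given; the KASSERT in
 * uvm_pglistalloc() enforces that.
 */
int
example_alloc_isadma(struct pglist *pgl)
{
	int error;

	TAILQ_INIT(pgl);
	error = uvm_pglistalloc(64 * 1024,	/* size in bytes */
	    0, 0x00ffffffUL,			/* low, high (high is inclusive) */
	    PAGE_SIZE, 0,			/* alignment, no boundary */
	    pgl, 1,				/* result list, at most one segment */
	    UVM_PLA_NOWAIT | UVM_PLA_ZERO | UVM_PLA_TRY_CONTIG);

	/* On success the pages are on pgl; release later with uvm_pglistfree(). */
	return (error);
}
```

As the new comment in uvm.h notes, the upper bound is inclusive (a 16-bit range is written { 0, 0xffff }), which is why the ISA limit above is 0x00ffffff rather than 0x01000000.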
diff --git a/sys/arch/amd64/amd64/pmap.c b/sys/arch/amd64/amd64/pmap.c index 4bd4ba51f9f..fb46e417f84 100644 --- a/sys/arch/amd64/amd64/pmap.c +++ b/sys/arch/amd64/amd64/pmap.c @@ -1,4 +1,4 @@ -/* $OpenBSD: pmap.c,v 1.42 2009/05/28 09:05:33 art Exp $ */ +/* $OpenBSD: pmap.c,v 1.43 2009/06/01 17:42:33 ariane Exp $ */ /* $NetBSD: pmap.c,v 1.3 2003/05/08 18:13:13 thorpej Exp $ */ /* @@ -834,7 +834,7 @@ pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level, pmap->pm_ptphint[lidx] = TAILQ_FIRST(&obj->memq); ptp->wire_count = 0; uvm_pagerealloc(ptp, NULL, 0); - TAILQ_INSERT_TAIL(pagelist, ptp, listq); + TAILQ_INSERT_TAIL(pagelist, ptp, fq.queues.listq); } void @@ -1545,7 +1545,7 @@ pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags) PMAP_MAP_TO_HEAD_UNLOCK(); while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) { - TAILQ_REMOVE(&empty_ptps, ptp, listq); + TAILQ_REMOVE(&empty_ptps, ptp, fq.queues.listq); uvm_pagefree(ptp); } @@ -1617,7 +1617,7 @@ pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags) PMAP_MAP_TO_HEAD_UNLOCK(); while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) { - TAILQ_REMOVE(&empty_ptps, ptp, listq); + TAILQ_REMOVE(&empty_ptps, ptp, fq.queues.listq); uvm_pagefree(ptp); } } @@ -1690,7 +1690,7 @@ pmap_page_remove(struct vm_page *pg) pmap_tlb_shootwait(); while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) { - TAILQ_REMOVE(&empty_ptps, ptp, listq); + TAILQ_REMOVE(&empty_ptps, ptp, fq.queues.listq); uvm_pagefree(ptp); } } diff --git a/sys/arch/amd64/include/vmparam.h b/sys/arch/amd64/include/vmparam.h index fd82b226db2..d3c5c9dd102 100644 --- a/sys/arch/amd64/include/vmparam.h +++ b/sys/arch/amd64/include/vmparam.h @@ -1,4 +1,4 @@ -/* $OpenBSD: vmparam.h,v 1.10 2008/07/18 16:40:17 kurt Exp $ */ +/* $OpenBSD: vmparam.h,v 1.11 2009/06/01 17:42:33 ariane Exp $ */ /* $NetBSD: vmparam.h,v 1.1 2003/04/26 18:39:49 fvdl Exp $ */ /*- @@ -112,6 +112,13 @@ #define VM_FREELIST_LOW 1 #define VM_FREELIST_HIGH 2 +/* reserve ISA-DMA and 32-bit DMA memory */ +#define UVM_IO_RANGES \ + { \ + { 0, 0x00ffffffUL }, \ + { 0, 0xffffffffUL }, \ + } + #define __HAVE_VM_PAGE_MD struct pv_entry; struct vm_page_md { diff --git a/sys/arch/i386/i386/pmap.c b/sys/arch/i386/i386/pmap.c index 753298eb42a..91fd6edb555 100644 --- a/sys/arch/i386/i386/pmap.c +++ b/sys/arch/i386/i386/pmap.c @@ -1,4 +1,4 @@ -/* $OpenBSD: pmap.c,v 1.136 2009/02/05 01:13:21 oga Exp $ */ +/* $OpenBSD: pmap.c,v 1.137 2009/06/01 17:42:33 ariane Exp $ */ /* $NetBSD: pmap.c,v 1.91 2000/06/02 17:46:37 thorpej Exp $ */ /* @@ -2074,7 +2074,7 @@ pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags) /* If PTP is no longer being used, free it. 
*/ if (ptp && ptp->wire_count <= 1) { pmap_drop_ptp(pmap, va, ptp, ptes); - TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq); + TAILQ_INSERT_TAIL(&empty_ptps, ptp, fq.queues.listq); } if (!shootall) @@ -2088,7 +2088,7 @@ pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva, int flags) pmap_unmap_ptes(pmap); PMAP_MAP_TO_HEAD_UNLOCK(); while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) { - TAILQ_REMOVE(&empty_ptps, ptp, listq); + TAILQ_REMOVE(&empty_ptps, ptp, fq.queues.listq); uvm_pagefree(ptp); } } @@ -2145,7 +2145,8 @@ pmap_page_remove(struct vm_page *pg) if (pve->pv_ptp && --pve->pv_ptp->wire_count <= 1) { pmap_drop_ptp(pve->pv_pmap, pve->pv_va, pve->pv_ptp, ptes); - TAILQ_INSERT_TAIL(&empty_ptps, pve->pv_ptp, listq); + TAILQ_INSERT_TAIL(&empty_ptps, pve->pv_ptp, + fq.queues.listq); } pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va); @@ -2158,7 +2159,7 @@ pmap_page_remove(struct vm_page *pg) pmap_tlb_shootwait(); while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) { - TAILQ_REMOVE(&empty_ptps, ptp, listq); + TAILQ_REMOVE(&empty_ptps, ptp, fq.queues.listq); uvm_pagefree(ptp); } } diff --git a/sys/arch/i386/i386/pmapae.c b/sys/arch/i386/i386/pmapae.c index 683d282379a..b13ff7c9463 100644 --- a/sys/arch/i386/i386/pmapae.c +++ b/sys/arch/i386/i386/pmapae.c @@ -1,4 +1,4 @@ -/* $OpenBSD: pmapae.c,v 1.15 2009/01/27 22:14:13 miod Exp $ */ +/* $OpenBSD: pmapae.c,v 1.16 2009/06/01 17:42:33 ariane Exp $ */ /* * Copyright (c) 2006 Michael Shalayeff @@ -1453,14 +1453,15 @@ pmap_remove_pae(struct pmap *pmap, vaddr_t sva, vaddr_t eva) ptp->wire_count = 0; /* Postpone free to after shootdown. */ uvm_pagerealloc(ptp, NULL, 0); - TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq); + TAILQ_INSERT_TAIL(&empty_ptps, ptp, + fq.queues.listq); } } pmap_tlb_shootnow(cpumask); pmap_unmap_ptes_pae(pmap); /* unlock pmap */ PMAP_MAP_TO_HEAD_UNLOCK(); while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) { - TAILQ_REMOVE(&empty_ptps, ptp, listq); + TAILQ_REMOVE(&empty_ptps, ptp, fq.queues.listq); uvm_pagefree(ptp); } return; @@ -1546,7 +1547,7 @@ pmap_remove_pae(struct pmap *pmap, vaddr_t sva, vaddr_t eva) ptp->wire_count = 0; /* Postpone free to after shootdown. */ uvm_pagerealloc(ptp, NULL, 0); - TAILQ_INSERT_TAIL(&empty_ptps, ptp, listq); + TAILQ_INSERT_TAIL(&empty_ptps, ptp, fq.queues.listq); } } @@ -1554,7 +1555,7 @@ pmap_remove_pae(struct pmap *pmap, vaddr_t sva, vaddr_t eva) pmap_unmap_ptes_pae(pmap); PMAP_MAP_TO_HEAD_UNLOCK(); while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) { - TAILQ_REMOVE(&empty_ptps, ptp, listq); + TAILQ_REMOVE(&empty_ptps, ptp, fq.queues.listq); uvm_pagefree(ptp); } } @@ -1665,7 +1666,7 @@ pmap_page_remove_pae(struct vm_page *pg) /* Postpone free to after shootdown. 
*/ uvm_pagerealloc(pve->pv_ptp, NULL, 0); TAILQ_INSERT_TAIL(&empty_ptps, pve->pv_ptp, - listq); + fq.queues.listq); } } pmap_unmap_ptes_pae(pve->pv_pmap); /* unlocks pmap */ @@ -1676,7 +1677,7 @@ pmap_page_remove_pae(struct vm_page *pg) PMAP_HEAD_TO_MAP_UNLOCK(); pmap_tlb_shootnow(cpumask); while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) { - TAILQ_REMOVE(&empty_ptps, ptp, listq); + TAILQ_REMOVE(&empty_ptps, ptp, fq.queues.listq); uvm_pagefree(ptp); } } diff --git a/sys/arch/i386/include/vmparam.h b/sys/arch/i386/include/vmparam.h index 9c2163e6c2d..41e95b3f418 100644 --- a/sys/arch/i386/include/vmparam.h +++ b/sys/arch/i386/include/vmparam.h @@ -1,4 +1,4 @@ -/* $OpenBSD: vmparam.h,v 1.41 2008/07/18 16:40:17 kurt Exp $ */ +/* $OpenBSD: vmparam.h,v 1.42 2009/06/01 17:42:33 ariane Exp $ */ /* $NetBSD: vmparam.h,v 1.15 1994/10/27 04:16:34 cgd Exp $ */ /*- @@ -118,6 +118,13 @@ #define VM_FREELIST_DEFAULT 0 #define VM_FREELIST_FIRST16 1 +/* reserve ISA-DMA and 32-bit DMA memory */ +#define UVM_IO_RANGES \ + { \ + { 0, 0x00ffffffUL }, \ + { 0, 0xffffffffUL }, \ + } + #define __HAVE_VM_PAGE_MD struct pv_entry; struct vm_page_md { diff --git a/sys/conf/files b/sys/conf/files index 38d4e2014c2..826593c211d 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1,4 +1,4 @@ -# $OpenBSD: files,v 1.455 2009/05/06 18:21:23 stevesk Exp $ +# $OpenBSD: files,v 1.456 2009/06/01 17:42:33 ariane Exp $ # $NetBSD: files,v 1.87 1996/05/19 17:17:50 jonathan Exp $ # @(#)files.newconf 7.5 (Berkeley) 5/10/93 @@ -971,6 +971,7 @@ file uvm/uvm_page.c file uvm/uvm_pager.c file uvm/uvm_pdaemon.c file uvm/uvm_pglist.c +file uvm/uvm_pmemrange.c file uvm/uvm_stat.c file uvm/uvm_swap.c file uvm/uvm_swap_encrypt.c uvm_swap_encrypt diff --git a/sys/uvm/uvm.h b/sys/uvm/uvm.h index 3efcf89044c..087add79376 100644 --- a/sys/uvm/uvm.h +++ b/sys/uvm/uvm.h @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm.h,v 1.32 2009/05/04 18:08:06 oga Exp $ */ +/* $OpenBSD: uvm.h,v 1.33 2009/06/01 17:42:33 ariane Exp $ */ /* $NetBSD: uvm.h,v 1.24 2000/11/27 08:40:02 chs Exp $ */ /* @@ -57,6 +57,7 @@ #include <uvm/uvm_page.h> #include <uvm/uvm_pager.h> #include <uvm/uvm_pdaemon.h> +#include <uvm/uvm_pmemrange.h> #include <uvm/uvm_swap.h> #ifdef UVM_SWAP_ENCRYPT #include <uvm/uvm_swap_encrypt.h> @@ -68,6 +69,32 @@ #include <machine/vmparam.h> /* + * UVM_IO_RANGES: paddr_t pairs, describing the lowest and highest address + * that should be reserved. These ranges (which may overlap) will have their + * use counter increased, causing them to be avoided if an allocation can be + * satisfied from another range of memory. + * + * IO ranges need not overlap with physmem ranges: the uvm code splits ranges + * on demand to satisfy requests. + * + * UVM_IO_RANGES specified here actually translates into a call to + * uvm_pmr_use_inc() at uvm initialization time. uvm_pmr_use_inc() can also + * be called after uvm_init() has completed. + * + * Note: the upper bound is specified in the same way as to uvm_pglistalloc. + * Ex: a memory range of 16 bit is specified as: { 0, 0xffff }. + */ +#ifndef UVM_IO_RANGES +#define UVM_IO_RANGES {} +#endif + +/* UVM IO ranges are described in an array of uvm_io_ranges. */ +struct uvm_io_ranges { + paddr_t low; + paddr_t high; +}; + +/* * uvm structure (vm global state: collected in one structure for ease * of reference...) 
*/ @@ -76,7 +103,7 @@ struct uvm { /* vm_page related parameters */ /* vm_page queues */ - struct pgfreelist page_free[VM_NFREELIST]; /* unallocated pages */ + struct uvm_pmr_control pmr_control; /* pmemrange control data */ struct pglist page_active; /* allocated pages, in use */ struct pglist page_inactive_swp;/* pages inactive (reclaim or free) */ struct pglist page_inactive_obj;/* pages inactive (reclaim or free) */ diff --git a/sys/uvm/uvm_extern.h b/sys/uvm/uvm_extern.h index 5ff1e2ddad2..bc6a766590a 100644 --- a/sys/uvm/uvm_extern.h +++ b/sys/uvm/uvm_extern.h @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_extern.h,v 1.76 2009/04/20 00:30:18 oga Exp $ */ +/* $OpenBSD: uvm_extern.h,v 1.77 2009/06/01 17:42:33 ariane Exp $ */ /* $NetBSD: uvm_extern.h,v 1.57 2001/03/09 01:02:12 chs Exp $ */ /* @@ -221,11 +221,12 @@ typedef int vm_prot_t; #define UVM_PGA_ZERO 0x0002 /* returned page must be zeroed */ /* - * flags for uvm_pglistalloc() + * flags for uvm_pglistalloc() and uvm_pmr_getpages() */ #define UVM_PLA_WAITOK 0x0001 /* may sleep */ #define UVM_PLA_NOWAIT 0x0002 /* can't sleep (need one of the two) */ #define UVM_PLA_ZERO 0x0004 /* zero all pages before returning */ +#define UVM_PLA_TRY_CONTIG 0x0008 /* try to allocate a contig range */ /* * lockflags that control the locking behavior of various functions. @@ -589,6 +590,10 @@ int uvm_pglistalloc(psize_t, paddr_t, struct pglist *, int, int); void uvm_pglistfree(struct pglist *); +/* uvm_pmemrange.c */ + +void uvm_pmr_use_inc(paddr_t, paddr_t); + /* uvm_swap.c */ void uvm_swap_init(void); diff --git a/sys/uvm/uvm_map.c b/sys/uvm/uvm_map.c index 8858a585027..1b6f3262986 100644 --- a/sys/uvm/uvm_map.c +++ b/sys/uvm/uvm_map.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_map.c,v 1.110 2009/05/02 12:54:42 oga Exp $ */ +/* $OpenBSD: uvm_map.c,v 1.111 2009/06/01 17:42:33 ariane Exp $ */ /* $NetBSD: uvm_map.c,v 1.86 2000/11/27 08:40:03 chs Exp $ */ /* @@ -3822,7 +3822,7 @@ uvm_object_printit(uobj, full, pr) (*pr)(" PAGES <pg,offset>:\n "); for (pg = TAILQ_FIRST(&uobj->memq); pg != NULL; - pg = TAILQ_NEXT(pg, listq), cnt++) { + pg = TAILQ_NEXT(pg, fq.queues.listq), cnt++) { (*pr)("<%p,0x%llx> ", pg, (long long)pg->offset); if ((cnt % 3) == 2) { (*pr)("\n "); @@ -3883,7 +3883,8 @@ uvm_page_printit(pg, full, pr) uobj = pg->uobject; if (uobj) { (*pr)(" checking object list\n"); - TAILQ_FOREACH(tpg, &uobj->memq, listq) { + TAILQ_FOREACH(tpg, &uobj->memq, + fq.queues.listq) { if (tpg == pg) { break; } @@ -3898,9 +3899,11 @@ uvm_page_printit(pg, full, pr) /* cross-verify page queue */ if (pg->pg_flags & PQ_FREE) { - int fl = uvm_page_lookup_freelist(pg); - pgl = &uvm.page_free[fl].pgfl_queues[((pg)->pg_flags & PG_ZERO) ? - PGFL_ZEROS : PGFL_UNKNOWN]; + if (uvm_pmr_isfree(pg)) + printf(" page found in uvm_pmemrange\n"); + else + printf(" >>> page not found in uvm_pmemrange <<<\n"); + pgl = NULL; } else if (pg->pg_flags & PQ_INACTIVE) { pgl = (pg->pg_flags & PQ_SWAPBACKED) ? 
&uvm.page_inactive_swp : &uvm.page_inactive_obj; diff --git a/sys/uvm/uvm_page.c b/sys/uvm/uvm_page.c index 39008ac0c19..7c6e257ccb5 100644 --- a/sys/uvm/uvm_page.c +++ b/sys/uvm/uvm_page.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_page.c,v 1.80 2009/05/08 15:10:35 ariane Exp $ */ +/* $OpenBSD: uvm_page.c,v 1.81 2009/06/01 17:42:33 ariane Exp $ */ /* $NetBSD: uvm_page.c,v 1.44 2000/11/27 08:40:04 chs Exp $ */ /* @@ -159,10 +159,11 @@ uvm_pageinsert(struct vm_page *pg) KASSERT((pg->pg_flags & PG_TABLED) == 0); mtx_enter(&uvm.hashlock); buck = &uvm.page_hash[uvm_pagehash(pg->uobject,pg->offset)]; - TAILQ_INSERT_TAIL(buck, pg, hashq); /* put in hash */ + TAILQ_INSERT_TAIL(buck, pg, fq.queues.hashq); /* put in hash */ mtx_leave(&uvm.hashlock); - TAILQ_INSERT_TAIL(&pg->uobject->memq, pg, listq); /* put in object */ + TAILQ_INSERT_TAIL(&pg->uobject->memq, pg, + fq.queues.listq); /* put in object */ atomic_setbits_int(&pg->pg_flags, PG_TABLED); pg->uobject->uo_npages++; } @@ -183,7 +184,7 @@ uvm_pageremove(struct vm_page *pg) KASSERT(pg->pg_flags & PG_TABLED); mtx_enter(&uvm.hashlock); buck = &uvm.page_hash[uvm_pagehash(pg->uobject,pg->offset)]; - TAILQ_REMOVE(buck, pg, hashq); + TAILQ_REMOVE(buck, pg, fq.queues.hashq); mtx_leave(&uvm.hashlock); #ifdef UBC @@ -193,7 +194,7 @@ uvm_pageremove(struct vm_page *pg) #endif /* object should be locked */ - TAILQ_REMOVE(&pg->uobject->memq, pg, listq); + TAILQ_REMOVE(&pg->uobject->memq, pg, fq.queues.listq); atomic_clearbits_int(&pg->pg_flags, PG_TABLED|PQ_AOBJ); pg->uobject->uo_npages--; @@ -226,15 +227,12 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp) * init the page queues and page queue locks */ - for (lcv = 0; lcv < VM_NFREELIST; lcv++) { - for (i = 0; i < PGFL_NQUEUES; i++) - TAILQ_INIT(&uvm.page_free[lcv].pgfl_queues[i]); - } TAILQ_INIT(&uvm.page_active); TAILQ_INIT(&uvm.page_inactive_swp); TAILQ_INIT(&uvm.page_inactive_obj); simple_lock_init(&uvm.pageqlock); mtx_init(&uvm.fpageqlock, IPL_VM); + uvm_pmr_init(); /* * init the <obj,offset> => <page> hash table. for now @@ -319,10 +317,13 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp) if (atop(paddr) >= vm_physmem[lcv].avail_start && atop(paddr) <= vm_physmem[lcv].avail_end) { uvmexp.npages++; - /* add page to free pool */ - uvm_pagefree(&vm_physmem[lcv].pgs[i]); } } + + /* add pages to free pool */ + uvm_pmr_freepages(&vm_physmem[lcv].pgs[ + vm_physmem[lcv].avail_start - vm_physmem[lcv].start], + vm_physmem[lcv].avail_end - vm_physmem[lcv].avail_start); } /* @@ -811,10 +812,10 @@ uvm_page_rehash(void) /* ... and rehash */ for (lcv = 0 ; lcv < oldcount ; lcv++) { while ((pg = TAILQ_FIRST(&oldbuckets[lcv])) != NULL) { - TAILQ_REMOVE(&oldbuckets[lcv], pg, hashq); + TAILQ_REMOVE(&oldbuckets[lcv], pg, fq.queues.hashq); TAILQ_INSERT_TAIL( &uvm.page_hash[uvm_pagehash(pg->uobject, pg->offset)], - pg, hashq); + pg, fq.queues.hashq); } } mtx_leave(&uvm.hashlock); @@ -892,18 +893,15 @@ struct vm_page * uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon, int flags, int strat, int free_list) { - int lcv, try1, try2, zeroit = 0; + struct pglist pgl; + int pmr_flags; struct vm_page *pg; - struct pglist *freeq; - struct pgfreelist *pgfl; boolean_t use_reserve; UVMHIST_FUNC("uvm_pagealloc_strat"); UVMHIST_CALLED(pghist); KASSERT(obj == NULL || anon == NULL); KASSERT(off == trunc_page(off)); - uvm_lock_fpageq(); - /* * check to see if we need to generate some free pages waking * the pagedaemon. 
@@ -930,95 +928,20 @@ uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon, (curproc == syncerproc)))) goto fail; -#if PGFL_NQUEUES != 2 -#error uvm_pagealloc_strat needs to be updated -#endif - - /* - * If we want a zero'd page, try the ZEROS queue first, otherwise - * we try the UNKNOWN queue first. - */ - if (flags & UVM_PGA_ZERO) { - try1 = PGFL_ZEROS; - try2 = PGFL_UNKNOWN; - } else { - try1 = PGFL_UNKNOWN; - try2 = PGFL_ZEROS; - } - - UVMHIST_LOG(pghist, "obj=%p off=%lx anon=%p flags=%lx", - obj, (u_long)off, anon, flags); - UVMHIST_LOG(pghist, "strat=%ld free_list=%ld", strat, free_list, 0, 0); - again: - switch (strat) { - case UVM_PGA_STRAT_NORMAL: - /* Check all freelists in descending priority order. */ - for (lcv = 0; lcv < VM_NFREELIST; lcv++) { - pgfl = &uvm.page_free[lcv]; - if ((pg = TAILQ_FIRST((freeq = - &pgfl->pgfl_queues[try1]))) != NULL || - (pg = TAILQ_FIRST((freeq = - &pgfl->pgfl_queues[try2]))) != NULL) - goto gotit; - } - - /* No pages free! */ - goto fail; - - case UVM_PGA_STRAT_ONLY: - case UVM_PGA_STRAT_FALLBACK: - /* Attempt to allocate from the specified free list. */ - KASSERT(free_list >= 0 && free_list < VM_NFREELIST); - pgfl = &uvm.page_free[free_list]; - if ((pg = TAILQ_FIRST((freeq = - &pgfl->pgfl_queues[try1]))) != NULL || - (pg = TAILQ_FIRST((freeq = - &pgfl->pgfl_queues[try2]))) != NULL) - goto gotit; - - /* Fall back, if possible. */ - if (strat == UVM_PGA_STRAT_FALLBACK) { - strat = UVM_PGA_STRAT_NORMAL; - goto again; - } - - /* No pages free! */ + pmr_flags = UVM_PLA_NOWAIT; + if (flags & UVM_PGA_ZERO) + pmr_flags |= UVM_PLA_ZERO; + TAILQ_INIT(&pgl); + if (uvm_pmr_getpages(1, 0, 0, 1, 0, 1, pmr_flags, &pgl) != 0) goto fail; - - default: - panic("uvm_pagealloc_strat: bad strat %d", strat); - /* NOTREACHED */ - } - - gotit: - TAILQ_REMOVE(freeq, pg, pageq); - uvmexp.free--; - - /* update zero'd page count */ - if (pg->pg_flags & PG_ZERO) - uvmexp.zeropages--; - - /* - * update allocation statistics and remember if we have to - * zero the page - */ - if (flags & UVM_PGA_ZERO) { - if (pg->pg_flags & PG_ZERO) { - uvmexp.pga_zerohit++; - zeroit = 0; - } else { - uvmexp.pga_zeromiss++; - zeroit = 1; - } - } - - uvm_unlock_fpageq(); /* unlock free page queue */ + pg = TAILQ_FIRST(&pgl); + KASSERT(pg != NULL); + KASSERT(TAILQ_NEXT(pg, pageq) == NULL); pg->offset = off; pg->uobject = obj; pg->uanon = anon; pg->pg_flags = PG_BUSY|PG_CLEAN|PG_FAKE; - pg->pg_version++; if (anon) { anon->an_page = pg; atomic_setbits_int(&pg->pg_flags, PQ_ANON); @@ -1034,22 +957,11 @@ uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon, #endif UVM_PAGE_OWN(pg, "new alloc"); - if (flags & UVM_PGA_ZERO) { - /* - * A zero'd page is not clean. If we got a page not already - * zero'd, then we have to zero it ourselves. - */ - atomic_clearbits_int(&pg->pg_flags, PG_CLEAN); - if (zeroit) - pmap_zero_page(pg); - } - UVMHIST_LOG(pghist, "allocated pg %p/%lx", pg, (u_long)VM_PAGE_TO_PHYS(pg), 0, 0); return(pg); fail: - uvm_unlock_fpageq(); UVMHIST_LOG(pghist, "failed!", 0, 0, 0, 0); return (NULL); } @@ -1100,6 +1012,7 @@ uvm_pagerealloc(struct vm_page *pg, struct uvm_object *newobj, voff_t newoff) void uvm_pagefree(struct vm_page *pg) { + struct pglist pgl; int saved_loan_count = pg->loan_count; UVMHIST_FUNC("uvm_pagefree"); UVMHIST_CALLED(pghist); @@ -1195,27 +1108,35 @@ uvm_pagefree(struct vm_page *pg) } /* - * and put on free queue + * Clean page state bits. 
*/ - - atomic_clearbits_int(&pg->pg_flags, PG_ZERO); - - uvm_lock_fpageq(); - TAILQ_INSERT_TAIL(&uvm.page_free[ - uvm_page_lookup_freelist(pg)].pgfl_queues[PGFL_UNKNOWN], pg, pageq); - atomic_clearbits_int(&pg->pg_flags, PQ_MASK); - atomic_setbits_int(&pg->pg_flags, PQ_FREE); + atomic_clearbits_int(&pg->pg_flags, + PG_ZERO|PG_FAKE|PG_BUSY|PG_RELEASED|PG_CLEAN|PG_CLEANCHK); + /* + * Pmap flag cleaning. + * XXX: Shouldn't pmap do this? + */ + atomic_clearbits_int(&pg->pg_flags, + PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3); + +#if defined(DIAGNOSTIC) + if (pg->pg_flags != 0) { + panic("uvm_pagefree: expected page %p pg_flags to be 0\n" + "uvm_pagefree: instead of pg->pg_flags = %x\n", + VM_PAGE_TO_PHYS(pg), pg->pg_flags); + } +#endif #ifdef DEBUG pg->uobject = (void *)0xdeadbeef; pg->offset = 0xdeadbeef; pg->uanon = (void *)0xdeadbeef; #endif - uvmexp.free++; + TAILQ_INIT(&pgl); + TAILQ_INSERT_HEAD(&pgl, pg, pageq); + uvm_pmr_freepageq(&pgl); if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) uvm.page_idle_zero = vm_page_zero_enable; - - uvm_unlock_fpageq(); } /* @@ -1308,6 +1229,7 @@ uvm_page_own(struct vm_page *pg, char *tag) void uvm_pageidlezero(void) { +#if 0 /* Disabled for now. */ struct vm_page *pg; struct pgfreelist *pgfl; int free_list; @@ -1374,6 +1296,7 @@ uvm_pageidlezero(void) uvmexp.zeropages++; uvm_unlock_fpageq(); } while (curcpu_is_idle()); +#endif /* 0 */ } /* @@ -1476,7 +1399,7 @@ uvm_pagelookup(struct uvm_object *obj, voff_t off) mtx_enter(&uvm.hashlock); buck = &uvm.page_hash[uvm_pagehash(obj,off)]; - TAILQ_FOREACH(pg, buck, hashq) { + TAILQ_FOREACH(pg, buck, fq.queues.hashq) { if (pg->uobject == obj && pg->offset == off) { break; } diff --git a/sys/uvm/uvm_page.h b/sys/uvm/uvm_page.h index e21562cd030..e7991dce4a0 100644 --- a/sys/uvm/uvm_page.h +++ b/sys/uvm/uvm_page.h @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_page.h,v 1.32 2009/04/28 16:06:07 miod Exp $ */ +/* $OpenBSD: uvm_page.h,v 1.33 2009/06/01 17:42:33 ariane Exp $ */ /* $NetBSD: uvm_page.h,v 1.19 2000/12/28 08:24:55 chs Exp $ */ /* @@ -106,11 +106,22 @@ #include <uvm/uvm_extern.h> #include <uvm/uvm_pglist.h> +union vm_page_fq { + struct { + TAILQ_ENTRY(vm_page) hashq; /* hash table links (O)*/ + TAILQ_ENTRY(vm_page) listq; /* pages in same object (O)*/ + } queues; + + struct { + RB_ENTRY(vm_page) tree; /* Free chunks, addr/size */ + psize_t pages; + } free; +}; + struct vm_page { + union vm_page_fq fq; /* free and queue management */ TAILQ_ENTRY(vm_page) pageq; /* queue info for FIFO * queue or free list (P) */ - TAILQ_ENTRY(vm_page) hashq; /* hash table links (O)*/ - TAILQ_ENTRY(vm_page) listq; /* pages in same object (O)*/ struct vm_anon *uanon; /* anon (O,P) */ struct uvm_object *uobject; /* object (O,P) */ diff --git a/sys/uvm/uvm_pglist.c b/sys/uvm/uvm_pglist.c index 093cd134b7f..ff0f8d91f68 100644 --- a/sys/uvm/uvm_pglist.c +++ b/sys/uvm/uvm_pglist.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_pglist.c,v 1.29 2009/05/04 18:08:06 oga Exp $ */ +/* $OpenBSD: uvm_pglist.c,v 1.30 2009/06/01 17:42:33 ariane Exp $ */ /* $NetBSD: uvm_pglist.c,v 1.13 2001/02/18 21:19:08 chs Exp $ */ /*- @@ -56,112 +56,6 @@ u_long uvm_pglistalloc_npages; #define STAT_DECR(v) #endif -int uvm_pglistalloc_simple(psize_t, paddr_t, paddr_t, struct pglist *); - -/* - * Simple page allocation: pages do not need to be contiguous. We just - * attempt to find enough free pages in the given range. 
- */ -int -uvm_pglistalloc_simple(psize_t size, paddr_t low, paddr_t high, - struct pglist *rlist) -{ - psize_t todo; - int psi; - struct vm_page *pg; - struct vm_physseg *seg; - paddr_t slow, shigh; - int pgflidx, error, free_list; - UVMHIST_FUNC("uvm_pglistalloc_simple"); UVMHIST_CALLED(pghist); -#ifdef DEBUG - vm_page_t tp; -#endif - - /* Default to "lose". */ - error = ENOMEM; - - todo = atop(size); - - /* - * Block all memory allocation and lock the free list. - */ - uvm_lock_fpageq(); - - /* Are there even any free pages? */ - if (uvmexp.free <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel)) - goto out; - - for (psi = 0, seg = vm_physmem; psi < vm_nphysseg; psi++, seg++) { - /* - * Skip this segment if incompatible with the address range. - */ - if (seg->avail_end <= atop(low)) - continue; - if (seg->avail_start >= atop(high)) - continue; - - slow = MAX(atop(low), seg->avail_start); - shigh = MIN(atop(high), seg->avail_end); - - /* we want to be able to allocate at least a page... */ - if (slow == shigh) - continue; - - for (pg = &seg->pgs[slow - seg->start]; slow != shigh; - slow++, pg++) { - if (VM_PAGE_IS_FREE(pg) == 0) - continue; - - free_list = uvm_page_lookup_freelist(pg); - pgflidx = (pg->pg_flags & PG_ZERO) ? - PGFL_ZEROS : PGFL_UNKNOWN; -#ifdef DEBUG - for (tp = TAILQ_FIRST(&uvm.page_free[free_list].pgfl_queues[pgflidx]); - tp != NULL; tp = TAILQ_NEXT(tp, pageq)) { - if (tp == pg) - break; - } - if (tp == NULL) - panic("uvm_pglistalloc_simple: page not on freelist"); -#endif - TAILQ_REMOVE(&uvm.page_free[free_list].pgfl_queues[pgflidx], - pg, pageq); - uvmexp.free--; - if (pg->pg_flags & PG_ZERO) - uvmexp.zeropages--; - pg->uobject = NULL; - pg->uanon = NULL; - pg->pg_version++; - TAILQ_INSERT_TAIL(rlist, pg, pageq); - STAT_INCR(uvm_pglistalloc_npages); - if (--todo == 0) { - error = 0; - goto out; - } - } - - } - -out: - /* - * check to see if we need to generate some free pages waking - * the pagedaemon. - */ - - if (!error && (uvmexp.free + uvmexp.paging < uvmexp.freemin || - (uvmexp.free + uvmexp.paging < uvmexp.freetarg && - uvmexp.inactive < uvmexp.inactarg))) { - wakeup(&uvm.pagedaemon_proc); - } - - uvm_unlock_fpageq(); - - if (error) - uvm_pglistfree(rlist); - - return (error); -} - /* * uvm_pglistalloc: allocate a list of pages * @@ -179,202 +73,45 @@ out: * alignment memory must be aligned to this power-of-two boundary. * boundary no segment in the allocation may cross this * power-of-two boundary (relative to zero). + * => flags: + * UVM_PLA_NOWAIT fail if allocation fails + * UVM_PLA_WAITOK wait for memory to become avail if allocation fails + * UVM_PLA_ZERO return zeroed memory + * UVM_PLA_TRY_CONTIG device prefers p-lineair mem */ int uvm_pglistalloc(psize_t size, paddr_t low, paddr_t high, paddr_t alignment, paddr_t boundary, struct pglist *rlist, int nsegs, int flags) { - int psi; - struct vm_page *pgs; - struct vm_physseg *seg; - paddr_t slow, shigh; - paddr_t try, idxpa, lastidxpa; - int tryidx, idx, pgflidx, endidx, error, free_list; - vm_page_t m; - u_long pagemask; -#ifdef DEBUG - vm_page_t tp; -#endif UVMHIST_FUNC("uvm_pglistalloc"); UVMHIST_CALLED(pghist); KASSERT((alignment & (alignment - 1)) == 0); KASSERT((boundary & (boundary - 1)) == 0); - /* - * This argument is always ignored for now, but ensure drivers always - * show intention. - */ KASSERT(!(flags & UVM_PLA_WAITOK) ^ !(flags & UVM_PLA_NOWAIT)); - - /* - * Our allocations are always page granularity, so our alignment - * must be, too. 
- */ - if (alignment < PAGE_SIZE) - alignment = PAGE_SIZE; if (size == 0) return (EINVAL); - size = round_page(size); - low = roundup(low, alignment); - /* - * If we are allowed to allocate as many segments as pages, - * no need to be smart. + * Convert byte addresses to page numbers. */ - if ((nsegs >= size / PAGE_SIZE) && (alignment == PAGE_SIZE) && - (boundary == 0)) { - error = uvm_pglistalloc_simple(size, low, high, rlist); - goto done; - } - - if (boundary != 0 && boundary < size) - return (EINVAL); - - pagemask = ~(boundary - 1); - - /* Default to "lose". */ - error = ENOMEM; - - /* - * Block all memory allocation and lock the free list. - */ - uvm_lock_fpageq(); - - /* Are there even any free pages? */ - if (uvmexp.free <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel)) - goto out; - - for (psi = 0, seg = vm_physmem; psi < vm_nphysseg; psi++, seg++) { - /* - * Skip this segment if incompatible with the address range. - */ - if (seg->avail_end <= atop(low)) - continue; - if (seg->avail_start >= atop(high)) - continue; - - slow = MAX(low, ptoa(seg->avail_start)); - shigh = MIN(high, ptoa(seg->avail_end)); - - try = roundup(slow, alignment); - for (;; try += alignment) { - if (try + size > shigh) { - /* - * We've run past the allowable range, or - * the segment. Try another. - */ - break; - } - - tryidx = idx = atop(try) - seg->start; - endidx = idx + atop(size); - pgs = vm_physmem[psi].pgs; - - /* - * Found a suitable starting page. See if the - * range is free. - */ - - for (; idx < endidx; idx++) { - if (VM_PAGE_IS_FREE(&pgs[idx]) == 0) { - break; - } - idxpa = VM_PAGE_TO_PHYS(&pgs[idx]); - if (idx == tryidx) - continue; - - /* - * Check that the region is contiguous - * (it really should...) and does not - * cross an alignment boundary. - */ - lastidxpa = VM_PAGE_TO_PHYS(&pgs[idx - 1]); - if ((lastidxpa + PAGE_SIZE) != idxpa) - break; - - if (boundary != 0 && - ((lastidxpa ^ idxpa) & pagemask) != 0) - break; - } - - if (idx == endidx) { - goto found; - } - } - } - - /* - * We could not allocate a contiguous range. This is where - * we should try harder if nsegs > 1... - */ - goto out; - -#if PGFL_NQUEUES != 2 -#error uvm_pglistalloc needs to be updated -#endif - -found: - /* - * we have a chunk of memory that conforms to the requested constraints. - */ - idx = tryidx; - while (idx < endidx) { - m = &pgs[idx]; - free_list = uvm_page_lookup_freelist(m); - pgflidx = (m->pg_flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN; -#ifdef DEBUG - for (tp = TAILQ_FIRST(&uvm.page_free[ - free_list].pgfl_queues[pgflidx]); - tp != NULL; - tp = TAILQ_NEXT(tp, pageq)) { - if (tp == m) - break; - } - if (tp == NULL) - panic("uvm_pglistalloc: page not on freelist"); -#endif - TAILQ_REMOVE(&uvm.page_free[free_list].pgfl_queues[pgflidx], - m, pageq); - uvmexp.free--; - if (m->pg_flags & PG_ZERO) - uvmexp.zeropages--; - m->uobject = NULL; - m->uanon = NULL; - m->pg_version++; - TAILQ_INSERT_TAIL(rlist, m, pageq); - idx++; - STAT_INCR(uvm_pglistalloc_npages); - } - error = 0; - -out: - /* - * check to see if we need to generate some free pages waking - * the pagedaemon. - */ - - if (uvmexp.free + uvmexp.paging < uvmexp.freemin || - (uvmexp.free + uvmexp.paging < uvmexp.freetarg && - uvmexp.inactive < uvmexp.inactarg)) { - wakeup(&uvm.pagedaemon_proc); - } - - uvm_unlock_fpageq(); - -done: - /* No locking needed here, pages are not on any queue. 
*/ - if (error == 0) { - TAILQ_FOREACH(m, rlist, pageq) { - if (flags & UVM_PLA_ZERO && - (m->pg_flags & PG_ZERO) == 0) - uvm_pagezero(m); - m->pg_flags = PG_CLEAN; - } - } - - return (error); + if (alignment < PAGE_SIZE) + alignment = PAGE_SIZE; + low = atop(roundup(low, alignment)); + /* Allows for overflow: 0xffff + 1 = 0x0000 */ + if ((high & PAGE_MASK) == PAGE_MASK) + high = atop(high) + 1; + else + high = atop(high); + size = atop(round_page(size)); + alignment = atop(alignment); + if (boundary < PAGE_SIZE && boundary != 0) + boundary = PAGE_SIZE; + boundary = atop(boundary); + + return uvm_pmr_getpages(size, low, high, alignment, boundary, nsegs, + flags, rlist); } /* @@ -389,14 +126,8 @@ uvm_pglistfree(struct pglist *list) struct vm_page *m; UVMHIST_FUNC("uvm_pglistfree"); UVMHIST_CALLED(pghist); - /* - * Block all memory allocation and lock the free list. - */ - uvm_lock_fpageq(); - - while ((m = TAILQ_FIRST(list)) != NULL) { + TAILQ_FOREACH(m, list, pageq) { KASSERT((m->pg_flags & (PQ_ACTIVE|PQ_INACTIVE)) == 0); - TAILQ_REMOVE(list, m, pageq); #ifdef DEBUG if (m->uobject == (void *)0xdeadbeef && m->uanon == (void *)0xdeadbeef) { @@ -408,15 +139,6 @@ uvm_pglistfree(struct pglist *list) m->uanon = (void *)0xdeadbeef; #endif atomic_clearbits_int(&m->pg_flags, PQ_MASK); - atomic_setbits_int(&m->pg_flags, PQ_FREE); - TAILQ_INSERT_TAIL(&uvm.page_free[ - uvm_page_lookup_freelist(m)].pgfl_queues[PGFL_UNKNOWN], - m, pageq); - uvmexp.free++; - if (uvmexp.zeropages < UVM_PAGEZERO_TARGET) - uvm.page_idle_zero = vm_page_zero_enable; - STAT_DECR(uvm_pglistalloc_npages); } - - uvm_unlock_fpageq(); + uvm_pmr_freepageq(list); } diff --git a/sys/uvm/uvm_pmemrange.c b/sys/uvm/uvm_pmemrange.c new file mode 100644 index 00000000000..86a0d137a97 --- /dev/null +++ b/sys/uvm/uvm_pmemrange.c @@ -0,0 +1,1248 @@ +/* $OpenBSD: uvm_pmemrange.c,v 1.1 2009/06/01 17:42:33 ariane Exp $ */ + +/* + * Copyright (c) 2009 Ariane van der Steldt <ariane@stack.nl> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <sys/param.h> +#include <uvm/uvm.h> +#include <sys/malloc.h> + +/* + * 2 trees: addr tree and size tree. + * + * addr tree is vm_page[0].fq.free.tree + * size tree is vm_page[1].fq.free.tree + * + * The size tree is not used for memory ranges of 1 page, instead, + * single queue is vm_page[0].pageq + * + * uvm_page_init guarantees that every vm_physseg contains an array of + * struct vm_page. Also, uvm_page_physload allocates an array of struct + * vm_page. This code depends on that array. + */ + +/* Tree comparators. */ +int uvm_pmemrange_addr_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *); +int uvm_pmemrange_use_cmp(struct uvm_pmemrange *, struct uvm_pmemrange *); +int uvm_pmr_addr_cmp(struct vm_page *, struct vm_page *); +int uvm_pmr_size_cmp(struct vm_page *, struct vm_page *); + +/* Memory types. 
The page flags are used to derive what the current memory + * type of a page is. */ +static __inline int +uvm_pmr_pg_to_memtype(struct vm_page *pg) +{ + if (pg->pg_flags & PG_ZERO) + return UVM_PMR_MEMTYPE_ZERO; + /* Default: dirty memory. */ + return UVM_PMR_MEMTYPE_DIRTY; +} + +/* Cancel static calls (for profiling). */ +#define static +#define __inline +/* Trees. */ +RB_PROTOTYPE(uvm_pmr_addr, vm_page, fq.free.tree, uvm_pmr_addr_cmp); +RB_PROTOTYPE(uvm_pmr_size, vm_page, fq.free.tree, uvm_pmr_size_cmp); +RB_PROTOTYPE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr, + uvm_pmemrange_addr_cmp); +RB_GENERATE(uvm_pmr_addr, vm_page, fq.free.tree, uvm_pmr_addr_cmp); +RB_GENERATE(uvm_pmr_size, vm_page, fq.free.tree, uvm_pmr_size_cmp); +RB_GENERATE(uvm_pmemrange_addr, uvm_pmemrange, pmr_addr, + uvm_pmemrange_addr_cmp); +#undef static +#undef __inline + +/* Validation. */ +#ifdef DEBUG +void uvm_pmr_assertvalid(struct uvm_pmemrange *pmr); +#else +#define uvm_pmr_assertvalid(pmr) do {} while (0) +#endif + + +int uvm_pmr_get1page(psize_t, int, struct pglist *, + paddr_t, paddr_t); + +struct uvm_pmemrange *uvm_pmr_allocpmr(void); +struct vm_page *uvm_pmr_nfindsz(struct uvm_pmemrange *, psize_t, int); +struct vm_page *uvm_pmr_nextsz(struct uvm_pmemrange *, + struct vm_page *, int); +void uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, + struct vm_page *pg, struct vm_page **pg_prev, + struct vm_page **pg_next); +struct vm_page *uvm_pmr_insert(struct uvm_pmemrange *, + struct vm_page *, int); +void uvm_pmr_remove(struct uvm_pmemrange *, + struct vm_page *); +psize_t uvm_pmr_remove_1strange(struct pglist *, paddr_t, + struct vm_page **); +void uvm_pmr_split(paddr_t); +struct uvm_pmemrange *uvm_pmemrange_find(paddr_t); +struct uvm_pmemrange *uvm_pmemrange_use_insert(struct uvm_pmemrange_use *, + struct uvm_pmemrange *); +struct vm_page *uvm_pmr_extract_range(struct uvm_pmemrange *, + struct vm_page *, paddr_t, paddr_t, + struct pglist *); + +/* + * Computes num/denom and rounds it up to the next power-of-2. + */ +static __inline psize_t +pow2divide(psize_t num, psize_t denom) +{ + int rshift = 0; + + while (num > (denom << rshift)) + rshift++; + return (paddr_t)1 << rshift; +} + +/* + * Predicate: lhs is a subrange or rhs. + */ +#define PMR_IS_SUBRANGE_OF(lhs_low, lhs_high, rhs_low, rhs_high) \ + ((lhs_low) >= (rhs_low) && (lhs_high <= rhs_high)) + +/* + * Align to power-of-2 alignment. + */ +#define PMR_ALIGN(pgno, align) \ + (((pgno) + ((align) - 1)) & ~((align) - 1)) + + +/* + * Comparator: sort by address ascending. + */ +int +uvm_pmemrange_addr_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs) +{ + return lhs->low < rhs->low ? -1 : lhs->low > rhs->low; +} + +/* + * Comparator: sort by use ascending. + * + * The higher the use value of a range, the more devices need memory in + * this range. Therefor allocate from the range with the lowest use first. + */ +int +uvm_pmemrange_use_cmp(struct uvm_pmemrange *lhs, struct uvm_pmemrange *rhs) +{ + int result; + + result = lhs->use < rhs->use ? -1 : lhs->use > rhs->use; + if (result == 0) + result = uvm_pmemrange_addr_cmp(lhs, rhs); + return result; +} + +int +uvm_pmr_addr_cmp(struct vm_page *lhs, struct vm_page *rhs) +{ + paddr_t lhs_addr, rhs_addr; + + lhs_addr = VM_PAGE_TO_PHYS(lhs); + rhs_addr = VM_PAGE_TO_PHYS(rhs); + + return (lhs_addr < rhs_addr ? -1 : lhs_addr > rhs_addr); +} + +int +uvm_pmr_size_cmp(struct vm_page *lhs, struct vm_page *rhs) +{ + psize_t lhs_size, rhs_size; + int cmp; + + /* Using second tree, so we receive pg[1] instead of pg[0]. 
*/ + lhs_size = (lhs - 1)->fq.free.pages; + rhs_size = (rhs - 1)->fq.free.pages; + + cmp = (lhs_size < rhs_size ? -1 : lhs_size > rhs_size); + if (cmp == 0) + cmp = uvm_pmr_addr_cmp(lhs - 1, rhs - 1); + return cmp; +} + +/* + * Find the first range of free pages that is at least sz pages long. + */ +struct vm_page * +uvm_pmr_nfindsz(struct uvm_pmemrange *pmr, psize_t sz, int mti) +{ + struct vm_page *node, *best; + + KASSERT(sz >= 1); + + if (sz == 1 && !TAILQ_EMPTY(&pmr->single[mti])) + return TAILQ_FIRST(&pmr->single[mti]); + + node = RB_ROOT(&pmr->size[mti]); + best = NULL; + while (node != NULL) { + if ((node - 1)->fq.free.pages >= sz) { + best = (node - 1); + node = RB_LEFT(node, fq.free.tree); + } else + node = RB_RIGHT(node, fq.free.tree); + } + return best; +} + +/* + * Finds the next range. The next range has a size >= pg->fq.free.pages. + * Returns NULL if no more ranges are available. + */ +struct vm_page * +uvm_pmr_nextsz(struct uvm_pmemrange *pmr, struct vm_page *pg, int mt) +{ + struct vm_page *npg; + + KASSERT(pmr != NULL && pg != NULL); + if (pg->fq.free.pages == 1) { + if (TAILQ_NEXT(pg, pageq) != NULL) + return TAILQ_NEXT(pg, pageq); + else + npg = RB_MIN(uvm_pmr_size, &pmr->size[mt]); + } else + npg = RB_NEXT(uvm_pmr_size, &pmr->size[mt], pg + 1); + + return npg == NULL ? NULL : npg - 1; +} + +/* + * Finds the previous and next ranges relative to the (uninserted) pg range. + * + * *pg_prev == NULL if no previous range is available, that can join with + * pg. + * *pg_next == NULL if no previous range is available, that can join with + * pg. + */ +void +uvm_pmr_pnaddr(struct uvm_pmemrange *pmr, struct vm_page *pg, + struct vm_page **pg_prev, struct vm_page **pg_next) +{ + KASSERT(pg_prev != NULL && pg_next != NULL); + + *pg_next = RB_NFIND(uvm_pmr_addr, &pmr->addr, pg); + if (*pg_next == NULL) + *pg_prev = RB_MAX(uvm_pmr_addr, &pmr->addr); + else + *pg_prev = RB_PREV(uvm_pmr_addr, &pmr->addr, *pg_next); + + /* Reset if not contig. */ + if (*pg_prev != NULL && + (atop(VM_PAGE_TO_PHYS(*pg_prev)) + (*pg_prev)->fq.free.pages + != atop(VM_PAGE_TO_PHYS(pg)) || + uvm_pmr_pg_to_memtype(*pg_prev) != uvm_pmr_pg_to_memtype(pg))) + *pg_prev = NULL; + if (*pg_next != NULL && + (atop(VM_PAGE_TO_PHYS(pg)) + pg->fq.free.pages + != atop(VM_PAGE_TO_PHYS(*pg_next)) || + uvm_pmr_pg_to_memtype(*pg_next) != uvm_pmr_pg_to_memtype(pg))) + *pg_next = NULL; + return; +} + +/* + * Remove a range from the address tree. + * Address tree maintains pmr counters. + */ +static __inline void +uvm_pmr_remove_addr(struct uvm_pmemrange *pmr, struct vm_page *pg) +{ + KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg); + KASSERT(pg->pg_flags & PQ_FREE); + RB_REMOVE(uvm_pmr_addr, &pmr->addr, pg); + + pmr->nsegs--; +} +/* + * Remove a range from the size tree. + */ +static __inline void +uvm_pmr_remove_size(struct uvm_pmemrange *pmr, struct vm_page *pg) +{ + int memtype; +#ifdef DEBUG + struct vm_page *i; +#endif + + KASSERT(pg->pg_flags & PQ_FREE); + memtype = uvm_pmr_pg_to_memtype(pg); + + if (pg->fq.free.pages == 1) { +#ifdef DEBUG + TAILQ_FOREACH(i, &pmr->single[memtype], pageq) { + if (i == pg) + break; + } + KDASSERT(i == pg); +#endif + TAILQ_REMOVE(&pmr->single[memtype], pg, pageq); + } else { + KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[memtype], + pg + 1) == pg + 1); + RB_REMOVE(uvm_pmr_size, &pmr->size[memtype], pg + 1); + } +} +/* Remove from both trees. 
*/ +void +uvm_pmr_remove(struct uvm_pmemrange *pmr, struct vm_page *pg) +{ + uvm_pmr_assertvalid(pmr); + uvm_pmr_remove_size(pmr, pg); + uvm_pmr_remove_addr(pmr, pg); + uvm_pmr_assertvalid(pmr); +} + +/* + * Insert the range described in pg. + * Returns the range thus created (which may be joined with the previous and + * next ranges). + * If no_join, the caller guarantees that the range cannot possibly join + * with adjecent ranges. + */ +static __inline struct vm_page * +uvm_pmr_insert_addr(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join) +{ + struct vm_page *prev, *next; + +#ifdef DEBUG + struct vm_page *i; + int mt; + + for (mt = 0; mt < UVM_PMR_MEMTYPE_MAX; mt++) { + TAILQ_FOREACH(i, &pmr->single[mt], pageq) + KDASSERT(i != pg); + if (pg->fq.free.pages > 1) { + KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[mt], + pg + 1) == NULL); + } + KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == NULL); + } +#endif + + KASSERT(pg->pg_flags & PQ_FREE); + KASSERT(pg->fq.free.pages >= 1); + + if (!no_join) { + uvm_pmr_pnaddr(pmr, pg, &prev, &next); + if (next != NULL) { + uvm_pmr_remove_size(pmr, next); + uvm_pmr_remove_addr(pmr, next); + pg->fq.free.pages += next->fq.free.pages; + next->fq.free.pages = 0; + } + if (prev != NULL) { + uvm_pmr_remove_size(pmr, prev); + prev->fq.free.pages += pg->fq.free.pages; + pg->fq.free.pages = 0; + return prev; + } + } +#ifdef DEBUG + else { + uvm_pmr_pnaddr(pmr, pg, &prev, &next); + KDASSERT(prev == NULL && next == NULL); + } +#endif /* DEBUG */ + + RB_INSERT(uvm_pmr_addr, &pmr->addr, pg); + + pmr->nsegs++; + + return pg; +} +/* + * Insert the range described in pg. + * Returns the range thus created (which may be joined with the previous and + * next ranges). + * Page must already be in the address tree. + */ +static __inline void +uvm_pmr_insert_size(struct uvm_pmemrange *pmr, struct vm_page *pg) +{ + int memtype; +#ifdef DEBUG + struct vm_page *i; + int mti; +#endif + + memtype = uvm_pmr_pg_to_memtype(pg); +#ifdef DEBUG + for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) { + TAILQ_FOREACH(i, &pmr->single[mti], pageq) + KDASSERT(i != pg); + if (pg->fq.free.pages > 1) { + KDASSERT(RB_FIND(uvm_pmr_size, &pmr->size[mti], + pg + 1) == NULL); + } + KDASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, pg) == pg); + } + for (i = pg; i < pg + pg->fq.free.pages; i++) + KASSERT(uvm_pmr_pg_to_memtype(i) == memtype); +#endif + + KASSERT(pg->pg_flags & PQ_FREE); + KASSERT(pg->fq.free.pages >= 1); + + if (pg->fq.free.pages == 1) + TAILQ_INSERT_TAIL(&pmr->single[memtype], pg, pageq); + else + RB_INSERT(uvm_pmr_size, &pmr->size[memtype], pg + 1); +} +/* Insert in both trees. */ +struct vm_page * +uvm_pmr_insert(struct uvm_pmemrange *pmr, struct vm_page *pg, int no_join) +{ + uvm_pmr_assertvalid(pmr); + pg = uvm_pmr_insert_addr(pmr, pg, no_join); + uvm_pmr_insert_size(pmr, pg); + uvm_pmr_assertvalid(pmr); + return pg; +} + +/* + * Remove the first segment of contiguous pages from pgl. + * A segment ends if it crosses boundary (unless boundary = 0) or + * if it would enter a different uvm_pmemrange. + * + * Work: the page range that the caller is currently working with. + * May be null. 
+ */ +psize_t +uvm_pmr_remove_1strange(struct pglist *pgl, paddr_t boundary, + struct vm_page **work) +{ + struct vm_page *pg, *pre_last, *last, *inserted; + psize_t count; + struct uvm_pmemrange *pmr; + paddr_t first_boundary; + + KASSERT(!TAILQ_EMPTY(pgl)); + + pg = TAILQ_FIRST(pgl); + pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg))); + KDASSERT(pmr != NULL); + if (boundary != 0) { + first_boundary = + PMR_ALIGN(atop(VM_PAGE_TO_PHYS(pg)) + 1, boundary); + } else + first_boundary = 0; + + /* Remove all pages in the first segment. */ + pre_last = pg; + last = TAILQ_NEXT(pre_last, pageq); + TAILQ_REMOVE(pgl, pre_last, pageq); + count = 1; + /* + * While loop checks the following: + * - last != NULL we have not reached the end of pgs + * - boundary == 0 || last < first_boundary + * we do not cross a boundary + * - atop(pre_last) + 1 == atop(last) + * still in the same segment + * - low <= last + * - high > last still testing the same memory range + * + * At the end of the loop, last points at the next segment + * and each page [pg, pre_last] (inclusive range) has been removed + * and count is the number of pages that have been removed. + */ + while (last != NULL && + (boundary == 0 || atop(VM_PAGE_TO_PHYS(last)) < first_boundary) && + atop(VM_PAGE_TO_PHYS(pre_last)) + 1 == + atop(VM_PAGE_TO_PHYS(last)) && + pmr->low <= atop(VM_PAGE_TO_PHYS(last)) && + pmr->high > atop(VM_PAGE_TO_PHYS(last))) { + count++; + pre_last = last; + last = TAILQ_NEXT(last, pageq); + TAILQ_REMOVE(pgl, pre_last, pageq); + } + KDASSERT(TAILQ_FIRST(pgl) == last); + KDASSERT(pg + (count - 1) == pre_last); + + pg->fq.free.pages = count; + inserted = uvm_pmr_insert(pmr, pg, 0); + + if (work != NULL && *work != NULL && + atop(VM_PAGE_TO_PHYS(inserted)) <= atop(VM_PAGE_TO_PHYS(*work)) && + atop(VM_PAGE_TO_PHYS(inserted)) + inserted->fq.free.pages > + atop(VM_PAGE_TO_PHYS(*work))) + *work = inserted; + return count; +} + +/* + * Extract a number of pages from a segment of free pages. + * Called by uvm_pmr_getpages. + * + * Returns the segment that was created from pages left over at the tail + * of the remove set of pages, or NULL if no pages were left at the tail. + */ +struct vm_page * +uvm_pmr_extract_range(struct uvm_pmemrange *pmr, struct vm_page *pg, + paddr_t start, paddr_t end, struct pglist *result) +{ + struct vm_page *after, *pg_i; + psize_t before_sz, after_sz; +#ifdef DEBUG + psize_t i; +#endif + + KASSERT(end > start); + KASSERT(pmr->low <= atop(VM_PAGE_TO_PHYS(pg))); + KASSERT(pmr->high >= atop(VM_PAGE_TO_PHYS(pg)) + pg->fq.free.pages); + KASSERT(atop(VM_PAGE_TO_PHYS(pg)) <= start); + KASSERT(atop(VM_PAGE_TO_PHYS(pg)) + pg->fq.free.pages >= end); + + before_sz = start - atop(VM_PAGE_TO_PHYS(pg)); + after_sz = atop(VM_PAGE_TO_PHYS(pg)) + pg->fq.free.pages - end; + KDASSERT(before_sz + after_sz + (end - start) == pg->fq.free.pages); + uvm_pmr_assertvalid(pmr); + + uvm_pmr_remove_size(pmr, pg); + if (before_sz == 0) + uvm_pmr_remove_addr(pmr, pg); + + /* Add selected pages to result. */ + for (pg_i = pg + before_sz; atop(VM_PAGE_TO_PHYS(pg_i)) < end; + pg_i++) { + pg_i->fq.free.pages = 0; + TAILQ_INSERT_TAIL(result, pg_i, pageq); + KDASSERT(pg_i->pg_flags & PQ_FREE); + } + + /* Before handling. */ + if (before_sz > 0) { + pg->fq.free.pages = before_sz; + uvm_pmr_insert_size(pmr, pg); + } + + /* After handling. 
*/ + after = NULL; + if (after_sz > 0) { + after = pg + before_sz + (end - start); +#ifdef DEBUG + for (i = 0; i < after_sz; i++) { + KASSERT(!uvm_pmr_isfree(after + i)); + } +#endif + KDASSERT(atop(VM_PAGE_TO_PHYS(after)) == end); + after->fq.free.pages = after_sz; + after = uvm_pmr_insert_addr(pmr, after, 1); + uvm_pmr_insert_size(pmr, after); + } + + uvm_pmr_assertvalid(pmr); + return after; +} + +/* + * Acquire a number of pages. + * + * count: the number of pages returned + * start: lowest page number + * end: highest page number +1 + * (start = end = 0: no limitation) + * align: power-of-2 alignment constraint (align = 1: no alignment) + * boundary: power-of-2 boundary (boundary = 0: no boundary) + * maxseg: maximum number of segments to return + * flags: UVM_PLA_* flags + * result: returned pages storage (uses pageq) + */ +int +uvm_pmr_getpages(psize_t count, paddr_t start, paddr_t end, paddr_t align, + paddr_t boundary, int maxseg, int flags, struct pglist *result) +{ + struct uvm_pmemrange *pmr; /* Iterate memory ranges. */ + struct vm_page *found, *f_next; /* Iterate chunks. */ + psize_t fcount; /* Current found pages. */ + int fnsegs; /* Current segment counter. */ + int try, start_try; + psize_t search[2]; + paddr_t fstart, fend; /* Pages to be taken from found. */ + int memtype; /* Requested memtype. */ + int desperate; /* True if allocation failed. */ + + /* Validate arguments. */ + KASSERT(count > 0); + KASSERT((start == 0 && end == 0) || (start < end)); + KASSERT(align >= 1 && powerof2(align)); + KASSERT(maxseg > 0); + KASSERT(boundary == 0 || powerof2(boundary)); + KDASSERT(boundary == 0 || maxseg * boundary >= count); + KASSERT(TAILQ_EMPTY(result)); + + /* Configure search. If start_try == 0, search[0] should be faster + * (because it will have to throw away less segments). + * search[1] is the worst case: start searching at the smallest + * possible range instead of starting at the range most likely to + * fulfill the allocation. */ + start_try = 0; + search[0] = (flags & UVM_PLA_TRY_CONTIG ? count : + pow2divide(count, maxseg)); + search[1] = 1; + if (maxseg == 1) { + start_try = 1; + search[1] = count; + } else if (search[1] >= search[0]) + start_try = 1; + +ReTry: /* Return point after sleeping. */ + fcount = 0; + fnsegs = 0; + + /* Memory type: if zeroed memory is requested, traverse the zero set. + * Otherwise, traverse the dirty set. */ + if (flags & UVM_PLA_ZERO) + memtype = UVM_PMR_MEMTYPE_ZERO; + else + memtype = UVM_PMR_MEMTYPE_DIRTY; + desperate = 0; + + uvm_lock_fpageq(); + +ReTryDesperate: + /* + * If we just want any page(s), go for the really fast option. + */ + if (count <= maxseg && align == 1 && boundary == 0 && + (flags & UVM_PLA_TRY_CONTIG) == 0) { + if (!desperate) { + KASSERT(fcount == 0); + fcount += uvm_pmr_get1page(count, memtype, result, + start, end); + } else { + for (memtype = 0; memtype < UVM_PMR_MEMTYPE_MAX && + fcount < count; memtype++) { + fcount += uvm_pmr_get1page(count - fcount, + memtype, result, start, end); + } + } + + if (fcount == count) + goto Out; + else + goto Fail; + } + + TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) { + /* Empty range. */ + if (pmr->nsegs == 0) + continue; + + /* Outside requested range. */ + if (!(start == 0 && end == 0) && + !PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, start, end)) + continue; + + try = start_try; +ReScan: /* Return point at try++. 
*/ + + for (found = uvm_pmr_nfindsz(pmr, search[try], memtype); + found != NULL; + found = f_next) { + f_next = uvm_pmr_nextsz(pmr, found, memtype); + + fstart = atop(VM_PAGE_TO_PHYS(found)); +DrainFound: + /* Throw away the first segment if fnsegs == maxseg */ + if (fnsegs == maxseg) { + fnsegs--; + fcount -= + uvm_pmr_remove_1strange(result, boundary, + &found); + } + + fstart = PMR_ALIGN(fstart, align); + fend = atop(VM_PAGE_TO_PHYS(found)) + + found->fq.free.pages; + if (fstart >= fend) + continue; + if (boundary != 0) { + fend = + MIN(fend, PMR_ALIGN(fstart + 1, boundary)); + } + if (fend - fstart > count - fcount) + fend = fstart + (count - fcount); + + fcount += fend - fstart; + fnsegs++; + found = uvm_pmr_extract_range(pmr, found, + fstart, fend, result); + + if (fcount == count) + goto Out; + + /* If there's still space left in found, try to + * fully drain it prior to continueing. */ + if (found != NULL) { + fstart = fend; + goto DrainFound; + } + } + + if (++try < nitems(search)) + goto ReScan; + } + + /* + * Not enough memory of the requested type available. Fall back to + * less good memory that we'll clean up better later. + * + * This algorithm is not very smart though, it just starts scanning + * a different typed range, but the nicer ranges of the previous + * iteration may fall out. + */ + if (!desperate) { + desperate = 1; + memtype = 0; + goto ReTryDesperate; + } else if (++memtype < UVM_PMR_MEMTYPE_MAX) + goto ReTryDesperate; + +Fail: + /* + * Allocation failed. + */ + + /* XXX: claim from memory reserve here */ + + while (!TAILQ_EMPTY(result)) + uvm_pmr_remove_1strange(result, 0, NULL); + uvm_unlock_fpageq(); + + if (flags & UVM_PLA_WAITOK) { + uvm_wait("uvm_pmr_getpages"); + goto ReTry; + } else + wakeup(&uvm.pagedaemon_proc); + + return ENOMEM; + +Out: + + /* + * Allocation succesful. + */ + + uvmexp.free -= fcount; + + uvm_unlock_fpageq(); + + /* Update statistics and zero pages if UVM_PLA_ZERO. */ + TAILQ_FOREACH(found, result, pageq) { + if (found->pg_flags & PG_ZERO) { + uvmexp.zeropages--; + } + if (flags & UVM_PLA_ZERO) { + if (found->pg_flags & PG_ZERO) + uvmexp.pga_zerohit++; + else { + uvmexp.pga_zeromiss++; + uvm_pagezero(found); + } + } + atomic_clearbits_int(&found->pg_flags, PG_ZERO | PQ_FREE); + + found->uobject = NULL; + found->uanon = NULL; + found->pg_version++; + } + + return 0; +} + +/* + * Free a number of contig pages (invoked by uvm_page_init). + */ +void +uvm_pmr_freepages(struct vm_page *pg, psize_t count) +{ + struct uvm_pmemrange *pmr; + psize_t i, pmr_count; + + uvm_lock_fpageq(); + + for (i = 0; i < count; i++) { + atomic_clearbits_int(&pg[i].pg_flags, pg[i].pg_flags); + atomic_setbits_int(&pg[i].pg_flags, PQ_FREE); + } + + while (count > 0) { + pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg))); + KASSERT(pmr != NULL); + + pmr_count = MIN(count, pmr->high - atop(VM_PAGE_TO_PHYS(pg))); + pg->fq.free.pages = pmr_count; + uvm_pmr_insert(pmr, pg, 0); + + uvmexp.free += pmr_count; + count -= pmr_count; + pg += pmr_count; + } + wakeup(&uvmexp.free); + + uvm_unlock_fpageq(); +} + +/* + * Free all pages in the queue. + */ +void +uvm_pmr_freepageq(struct pglist *pgl) +{ + struct vm_page *pg; + + TAILQ_FOREACH(pg, pgl, pageq) { + atomic_clearbits_int(&pg->pg_flags, pg->pg_flags); + atomic_setbits_int(&pg->pg_flags, PQ_FREE); + } + + uvm_lock_fpageq(); + while (!TAILQ_EMPTY(pgl)) + uvmexp.free += uvm_pmr_remove_1strange(pgl, 0, NULL); + wakeup(&uvmexp.free); + uvm_unlock_fpageq(); + + return; +} + +/* + * Store a pmemrange in the list. 
+ * + * The list is sorted by use. + */ +struct uvm_pmemrange * +uvm_pmemrange_use_insert(struct uvm_pmemrange_use *useq, + struct uvm_pmemrange *pmr) +{ + struct uvm_pmemrange *iter; + int cmp = 1; + + TAILQ_FOREACH(iter, useq, pmr_use) { + cmp = uvm_pmemrange_use_cmp(pmr, iter); + if (cmp == 0) + return iter; + if (cmp == -1) + break; + } + if (cmp == 0) + return iter; + + if (iter == NULL) + TAILQ_INSERT_TAIL(useq, pmr, pmr_use); + else + TAILQ_INSERT_BEFORE(iter, pmr, pmr_use); + return NULL; +} + +#ifdef DEBUG +/* + * Validation of the whole pmemrange. + * Called with fpageq locked. + */ +void +uvm_pmr_assertvalid(struct uvm_pmemrange *pmr) +{ + struct vm_page *prev, *next, *i, *xref; + int lcv, mti; + + /* Validate address tree. */ + RB_FOREACH(i, uvm_pmr_addr, &pmr->addr) { + /* Validate the range. */ + KASSERT(i->fq.free.pages > 0); + KASSERT(atop(VM_PAGE_TO_PHYS(i)) >= pmr->low); + KASSERT(atop(VM_PAGE_TO_PHYS(i)) + i->fq.free.pages + <= pmr->high); + + /* Validate each page in this range. */ + for (lcv = 0; lcv < i->fq.free.pages; lcv++) { + KASSERT(lcv == 0 || i[lcv].fq.free.pages == 0); + /* Flag check: + * - PG_ZERO: page is zeroed. + * - PQ_FREE: page is free. + * Any other flag is a mistake. */ + if (i[lcv].pg_flags != + (i[lcv].pg_flags & (PG_ZERO | PQ_FREE))) { + panic("i[%lu].pg_flags = %x, should be %x\n", + lcv, i[lcv].pg_flags, PG_ZERO | PQ_FREE); + } + /* Free pages are: + * - not wired + * - not loaned + * - have no vm_anon + * - have no uvm_object */ + KASSERT(i[lcv].wire_count == 0); + KASSERT(i[lcv].loan_count == 0); + KASSERT(i[lcv].uanon == NULL); + KASSERT(i[lcv].uobject == NULL); + /* Pages in a single range always have the same + * memtype. */ + KASSERT(uvm_pmr_pg_to_memtype(&i[0]) == + uvm_pmr_pg_to_memtype(&i[lcv])); + } + + /* Check that it shouldn't be joined with its predecessor. */ + prev = RB_PREV(uvm_pmr_addr, &pmr->addr, i); + if (prev != NULL) { + KASSERT(uvm_pmr_pg_to_memtype(&i[0]) != + uvm_pmr_pg_to_memtype(&i[lcv]) || + atop(VM_PAGE_TO_PHYS(i)) > + atop(VM_PAGE_TO_PHYS(prev)) + prev->fq.free.pages); + } + + /* Assert i is in the size tree as well. */ + if (i->fq.free.pages == 1) { + TAILQ_FOREACH(xref, + &pmr->single[uvm_pmr_pg_to_memtype(i)], pageq) { + if (xref == i) + break; + } + KASSERT(xref == i); + } else { + KASSERT(RB_FIND(uvm_pmr_size, + &pmr->size[uvm_pmr_pg_to_memtype(i)], i + 1) == + i + 1); + } + } + + /* Validate size tree. */ + for (mti = 0; mti < UVM_PMR_MEMTYPE_MAX; mti++) { + for (i = uvm_pmr_nfindsz(pmr, 1, mti); i != NULL; i = next) { + next = uvm_pmr_nextsz(pmr, i, mti); + if (next != NULL) { + KASSERT(i->fq.free.pages <= + next->fq.free.pages); + } + + /* Assert i is in the addr tree as well. */ + KASSERT(RB_FIND(uvm_pmr_addr, &pmr->addr, i) == i); + + /* Assert i is of the correct memory type. */ + KASSERT(uvm_pmr_pg_to_memtype(i) == mti); + } + } + + /* Validate nsegs statistic. */ + lcv = 0; + RB_FOREACH(i, uvm_pmr_addr, &pmr->addr) + lcv++; + KASSERT(pmr->nsegs == lcv); +} +#endif /* DEBUG */ + +/* + * Split pmr at split point pageno. + * Called with fpageq unlocked. + * + * Split is only applied if a pmemrange spans pageno. + */ +void +uvm_pmr_split(paddr_t pageno) +{ + struct uvm_pmemrange *pmr, *drain; + struct vm_page *rebuild, *prev, *next; + psize_t prev_sz; + + uvm_lock_fpageq(); + pmr = uvm_pmemrange_find(pageno); + if (pmr == NULL || !(pmr->low < pageno)) { + /* No split required. 
*/
+		uvm_unlock_fpageq();
+		return;
+	}
+
+	KASSERT(pmr->low < pageno);
+	KASSERT(pmr->high > pageno);
+
+	drain = uvm_pmr_allocpmr();
+	drain->low = pageno;
+	drain->high = pmr->high;
+	drain->use = pmr->use;
+
+	uvm_pmr_assertvalid(pmr);
+	uvm_pmr_assertvalid(drain);
+	KASSERT(drain->nsegs == 0);
+
+	RB_FOREACH(rebuild, uvm_pmr_addr, &pmr->addr) {
+		if (atop(VM_PAGE_TO_PHYS(rebuild)) >= pageno)
+			break;
+	}
+	if (rebuild == NULL)
+		prev = RB_MAX(uvm_pmr_addr, &pmr->addr);
+	else
+		prev = RB_PREV(uvm_pmr_addr, &pmr->addr, rebuild);
+	KASSERT(prev == NULL || atop(VM_PAGE_TO_PHYS(prev)) < pageno);
+
+	/*
+	 * Handle free chunk that spans the split point.
+	 */
+	if (prev != NULL &&
+	    atop(VM_PAGE_TO_PHYS(prev)) + prev->fq.free.pages > pageno) {
+		psize_t before, after;
+
+		KASSERT(atop(VM_PAGE_TO_PHYS(prev)) < pageno);
+
+		uvm_pmr_remove(pmr, prev);
+		prev_sz = prev->fq.free.pages;
+		before = pageno - atop(VM_PAGE_TO_PHYS(prev));
+		after = atop(VM_PAGE_TO_PHYS(prev)) + prev_sz - pageno;
+
+		KASSERT(before > 0);
+		KASSERT(after > 0);
+
+		prev->fq.free.pages = before;
+		uvm_pmr_insert(pmr, prev, 1);
+		(prev + before)->fq.free.pages = after;
+		uvm_pmr_insert(drain, prev + before, 1);
+	}
+
+	/*
+	 * Move free chunks that no longer fall in the range.
+	 */
+	for (; rebuild != NULL; rebuild = next) {
+		next = RB_NEXT(uvm_pmr_addr, &pmr->addr, rebuild);
+
+		uvm_pmr_remove(pmr, rebuild);
+		uvm_pmr_insert(drain, rebuild, 1);
+	}
+
+	pmr->high = pageno;
+	uvm_pmr_assertvalid(pmr);
+	uvm_pmr_assertvalid(drain);
+
+	RB_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, drain);
+	uvm_pmemrange_use_insert(&uvm.pmr_control.use, drain);
+	uvm_unlock_fpageq();
+}
+
+/*
+ * Increase the usage counter for the given range of memory.
+ *
+ * The higher the usage counter of a given range of memory, the harder
+ * the allocator will try to avoid allocating from it.
+ *
+ * Addresses here are in paddr_t, not page numbers.
+ * The lowest and highest allowed addresses are specified.
+ */
+void
+uvm_pmr_use_inc(paddr_t low, paddr_t high)
+{
+	struct uvm_pmemrange *pmr;
+
+	/*
+	 * If high+1 == 0, then you are increasing use of the whole address
+	 * space, which won't make any difference. Skip in that case.
+	 */
+	high++;
+	if (high == 0)
+		return;
+
+	/*
+	 * pmr uses page numbers, translate low and high.
+	 */
+	low = atop(round_page(low));
+	high = atop(trunc_page(high));
+	uvm_pmr_split(low);
+	uvm_pmr_split(high);
+
+	uvm_lock_fpageq();
+
+	/* Increase use count on segments in range. */
+	RB_FOREACH(pmr, uvm_pmemrange_addr, &uvm.pmr_control.addr) {
+		if (PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, low, high)) {
+			TAILQ_REMOVE(&uvm.pmr_control.use, pmr, pmr_use);
+			pmr->use++;
+			uvm_pmemrange_use_insert(&uvm.pmr_control.use, pmr);
+		}
+		uvm_pmr_assertvalid(pmr);
+	}
+
+	uvm_unlock_fpageq();
+}
+
+/*
+ * Allocate a pmemrange.
+ *
+ * If called from uvm_page_init, uvm_pageboot_alloc is used.
+ * If called after uvm_init, malloc is used.
+ * (And if called in between, you're dead.)
+ */
+struct uvm_pmemrange *
+uvm_pmr_allocpmr()
+{
+	struct uvm_pmemrange *nw;
+	int i;
+
+	if (!uvm.page_init_done) {
+		nw = (struct uvm_pmemrange *)
+		    uvm_pageboot_alloc(sizeof(struct uvm_pmemrange));
+		bzero(nw, sizeof(struct uvm_pmemrange));
+	} else {
+		nw = malloc(sizeof(struct uvm_pmemrange),
+		    M_VMMAP, M_NOWAIT | M_ZERO);
+	}
+	RB_INIT(&nw->addr);
+	for (i = 0; i < UVM_PMR_MEMTYPE_MAX; i++) {
+		RB_INIT(&nw->size[i]);
+		TAILQ_INIT(&nw->single[i]);
+	}
+	return nw;
+}
+
+static const struct uvm_io_ranges uvm_io_ranges[] = UVM_IO_RANGES;
+
+/*
+ * Initialization of pmr.
+ * Called by uvm_page_init.
+ *
+ * Sets up pmemranges that map the vm_physmem data.
+ */
+void
+uvm_pmr_init(void)
+{
+	struct uvm_pmemrange *new_pmr;
+	int i;
+
+	TAILQ_INIT(&uvm.pmr_control.use);
+	RB_INIT(&uvm.pmr_control.addr);
+
+	for (i = 0 ; i < vm_nphysseg ; i++) {
+		new_pmr = uvm_pmr_allocpmr();
+
+		new_pmr->low = vm_physmem[i].start;
+		new_pmr->high = vm_physmem[i].end;
+
+		RB_INSERT(uvm_pmemrange_addr, &uvm.pmr_control.addr, new_pmr);
+		uvm_pmemrange_use_insert(&uvm.pmr_control.use, new_pmr);
+	}
+
+	for (i = 0; i < nitems(uvm_io_ranges); i++)
+		uvm_pmr_use_inc(uvm_io_ranges[i].low, uvm_io_ranges[i].high);
+}
+
+/*
+ * Find the pmemrange that contains the given page number.
+ *
+ * (Manually traverses the binary tree, because that is cheaper on stack
+ * usage.)
+ */
+struct uvm_pmemrange *
+uvm_pmemrange_find(paddr_t pageno)
+{
+	struct uvm_pmemrange *pmr;
+
+	pmr = RB_ROOT(&uvm.pmr_control.addr);
+	while (pmr != NULL) {
+		if (pmr->low > pageno)
+			pmr = RB_LEFT(pmr, pmr_addr);
+		else if (pmr->high <= pageno)
+			pmr = RB_RIGHT(pmr, pmr_addr);
+		else
+			break;
+	}
+
+	return pmr;
+}
+
+#if defined(DDB) || defined(DEBUG)
+/*
+ * Return true if the given page is in any of the free lists.
+ * Used by uvm_page_printit.
+ * This function is safe, even if the page is not on the freeq.
+ * Note: does not apply locking, only called from ddb.
+ */
+int
+uvm_pmr_isfree(struct vm_page *pg)
+{
+	struct vm_page *r;
+	struct uvm_pmemrange *pmr;
+
+	pmr = uvm_pmemrange_find(atop(VM_PAGE_TO_PHYS(pg)));
+	if (pmr == NULL)
+		return 0;
+	r = RB_NFIND(uvm_pmr_addr, &pmr->addr, pg);
+	if (r == NULL)
+		r = RB_MAX(uvm_pmr_addr, &pmr->addr);
+	else
+		r = RB_PREV(uvm_pmr_addr, &pmr->addr, r);
+	if (r == NULL)
+		return 0; /* Empty tree. */
+
+	KDASSERT(atop(VM_PAGE_TO_PHYS(r)) <= atop(VM_PAGE_TO_PHYS(pg)));
+	return atop(VM_PAGE_TO_PHYS(r)) + r->fq.free.pages >
+	    atop(VM_PAGE_TO_PHYS(pg));
+}
+#endif /* DEBUG */
+
+/*
+ * Allocate any page, the fastest way. No constraints.
+ */
+int
+uvm_pmr_get1page(psize_t count, int memtype, struct pglist *result,
+    paddr_t start, paddr_t end)
+{
+	struct uvm_pmemrange *pmr;
+	struct vm_page *found;
+	psize_t fcount;
+
+	fcount = 0;
+	pmr = TAILQ_FIRST(&uvm.pmr_control.use);
+	while (pmr != NULL && fcount != count) {
+		/* Outside requested range. */
+		if (!(start == 0 && end == 0) &&
+		    !PMR_IS_SUBRANGE_OF(pmr->low, pmr->high, start, end)) {
+			pmr = TAILQ_NEXT(pmr, pmr_use);
+			continue;
+		}
+
+		found = TAILQ_FIRST(&pmr->single[memtype]);
+		if (found == NULL) {
+			found = RB_ROOT(&pmr->size[memtype]);
+			/* Size tree gives pg[1] instead of pg[0] */
+			if (found != NULL)
+				found--;
+		}
+		if (found == NULL) {
+			pmr = TAILQ_NEXT(pmr, pmr_use);
+			continue;
+		}
+
+		uvm_pmr_assertvalid(pmr);
+		uvm_pmr_remove_size(pmr, found);
+		while (found->fq.free.pages > 0 && fcount < count) {
+			found->fq.free.pages--;
+			fcount++;
+			TAILQ_INSERT_HEAD(result,
+			    &found[found->fq.free.pages], pageq);
+		}
+		if (found->fq.free.pages > 0) {
+			uvm_pmr_insert_size(pmr, found);
+			KASSERT(fcount == count);
+			uvm_pmr_assertvalid(pmr);
+			return fcount;
+		} else
+			uvm_pmr_remove_addr(pmr, found);
+		uvm_pmr_assertvalid(pmr);
+	}
+
+	/* Ran out of ranges before enough pages were gathered. */
+	return fcount;
+}
diff --git a/sys/uvm/uvm_pmemrange.h b/sys/uvm/uvm_pmemrange.h
new file mode 100644
index 00000000000..90219dc075b
--- /dev/null
+++ b/sys/uvm/uvm_pmemrange.h
@@ -0,0 +1,83 @@
+/*	$OpenBSD: uvm_pmemrange.h,v 1.1 2009/06/01 17:42:33 ariane Exp $	*/
+
+/*
+ * Copyright (c) 2009 Ariane van der Steldt <ariane@stack.nl>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * uvm_pmemrange.h: describe and manage free physical memory.
+ */
+
+#ifndef _UVM_UVM_PMEMRANGE_H_
+#define _UVM_UVM_PMEMRANGE_H_
+
+#include <uvm/uvm_extern.h>
+#include <uvm/uvm_page.h>
+
+RB_HEAD(uvm_pmr_addr, vm_page);
+RB_HEAD(uvm_pmr_size, vm_page);
+
+/*
+ * Page types available:
+ * - DIRTY: this page may contain random data.
+ * - ZERO: this page has been zeroed.
+ */
+#define UVM_PMR_MEMTYPE_DIRTY	0
+#define UVM_PMR_MEMTYPE_ZERO	1
+#define UVM_PMR_MEMTYPE_MAX	2
+
+/*
+ * An address range of memory.
+ */
+struct uvm_pmemrange {
+	struct uvm_pmr_addr addr;	/* Free page chunks, sorted by addr. */
+	struct uvm_pmr_size size[UVM_PMR_MEMTYPE_MAX];
+					/* Free page chunks, sorted by size. */
+	TAILQ_HEAD(, vm_page) single[UVM_PMR_MEMTYPE_MAX];
+					/* single page regions (uses pageq) */
+
+	paddr_t	low;			/* Start of address range (pgno). */
+	paddr_t	high;			/* End +1 (pgno). */
+	int	use;			/* Use counter. */
+	int	nsegs;			/* Current range count. */
+
+	TAILQ_ENTRY(uvm_pmemrange) pmr_use;
+					/* pmr, sorted by use */
+	RB_ENTRY(uvm_pmemrange) pmr_addr;
+					/* pmr, sorted by address */
+};
+
+RB_HEAD(uvm_pmemrange_addr, uvm_pmemrange);
+TAILQ_HEAD(uvm_pmemrange_use, uvm_pmemrange);
+
+/*
+ * pmr control structure. Contained in uvm.pmr_control.
+ */
+struct uvm_pmr_control {
+	struct uvm_pmemrange_addr addr;
+	struct uvm_pmemrange_use use;
+};
+
+void	uvm_pmr_freepages(struct vm_page *, psize_t);
+void	uvm_pmr_freepageq(struct pglist *pgl);
+int	uvm_pmr_getpages(psize_t, paddr_t, paddr_t, paddr_t, paddr_t,
+	    int, int, struct pglist *);
+void	uvm_pmr_init(void);
+
+#ifdef DDB
+int	uvm_pmr_isfree(struct vm_page *pg);
+#endif
+
+#endif /* _UVM_UVM_PMEMRANGE_H_ */
diff --git a/sys/uvm/uvm_vnode.c b/sys/uvm/uvm_vnode.c
index 0ea3b01d07a..2e4870b745d 100644
--- a/sys/uvm/uvm_vnode.c
+++ b/sys/uvm/uvm_vnode.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: uvm_vnode.c,v 1.58 2009/05/23 14:06:37 oga Exp $	*/
+/*	$OpenBSD: uvm_vnode.c,v 1.59 2009/06/01 17:42:33 ariane Exp $	*/
 /*	$NetBSD: uvm_vnode.c,v 1.36 2000/11/24 20:34:01 chs Exp $	*/
 
 /*
@@ -561,7 +561,7 @@ uvm_vnp_terminate(struct vnode *vp)
 	while (uvn->u_obj.uo_npages) {
 #ifdef DEBUG
 		struct vm_page *pp;
-		TAILQ_FOREACH(pp, &uvn->u_obj.memq, listq) {
+		TAILQ_FOREACH(pp, &uvn->u_obj.memq, fq.queues.listq) {
 			if ((pp->pg_flags & PG_BUSY) == 0)
 				panic("uvm_vnp_terminate: detected unbusy pg");
 		}
diff --git a/sys/xfs/xfs_vnodeops-bsd.c b/sys/xfs/xfs_vnodeops-bsd.c
index ed74d6d478d..3256b1c0160 100644
--- a/sys/xfs/xfs_vnodeops-bsd.c
+++ b/sys/xfs/xfs_vnodeops-bsd.c
@@ -1119,7 +1119,7 @@ xfs_putpages (struct vop_putpages_args *ap)
 	while (pg && !dirty) {
 	    dirty = pmap_is_modified(pg) || (pg->flags & PG_CLEAN) == 0;
-	    pg = TAILQ_NEXT(pg, listq);
+	    pg = TAILQ_NEXT(pg, fq.queues.listq);
 	}
 
 	if (dirty)
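
Editorial note (not part of the commit): the sketch below shows how a caller might use the new interface declared in uvm_pmemrange.h. The wrapper name example_alloc_isadma is invented for illustration, and the uvm_pmr_getpages argument order (count, start, end, align, boundary, maxseg, flags, result) as well as the page-number units of the address arguments are inferred from the prototype and the function bodies above rather than confirmed by the diff.

/* Illustrative sketch only -- assumptions noted in the comments. */
int
example_alloc_isadma(void)
{
	struct pglist pgl;
	int error;

	TAILQ_INIT(&pgl);

	/*
	 * Request 4 zeroed pages below 16MB (ISA DMA), page-aligned, with
	 * no boundary restriction, in at most one physically contiguous
	 * segment.  Assumed argument order: count, start, end, align,
	 * boundary, maxseg, flags, result; start/end assumed to be page
	 * numbers (atop of the physical address).
	 */
	error = uvm_pmr_getpages(4, 0, atop(0x01000000), 1, 0, 1,
	    UVM_PLA_WAITOK | UVM_PLA_ZERO, &pgl);
	if (error)
		return error;

	/* ... map and use the pages ... */

	/* Hand the pages back to the free-range trees. */
	uvm_pmr_freepageq(&pgl);
	return 0;
}

Passing a maxseg equal to the page count instead of 1 would allow the request to be satisfied from several smaller free ranges, which is the trade-off the fnsegs/maxseg logic in uvm_pmr_getpages above implements.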