From 193e3efb70083a72f3d299ea5f129cf83d547115 Mon Sep 17 00:00:00 2001 From: Ariane van der Steldt Date: Fri, 9 Mar 2012 13:01:30 +0000 Subject: New vmmap implementation. no oks (it is really a pain to review properly) extensively tested, I'm confident it'll be stable 'now is the time' from several icb inhabitants Diff provides: - ability to specify different allocators for different regions/maps - a simpler implementation of the current allocator - currently in compatibility mode: it will generate similar addresses as the old allocator --- sys/arch/i386/i386/pmap.c | 25 +- sys/conf/files | 3 +- sys/dev/pci/drm/drm_bufs.c | 4 +- sys/dev/pci/drm/i915_drv.c | 8 +- sys/kern/exec_elf.c | 26 +- sys/kern/kern_exec.c | 6 +- sys/kern/kern_malloc.c | 11 +- sys/kern/sysv_shm.c | 8 +- sys/uvm/uvm.h | 20 +- sys/uvm/uvm_addr.c | 1556 ++++++++++ sys/uvm/uvm_addr.h | 116 + sys/uvm/uvm_extern.h | 8 +- sys/uvm/uvm_fault.c | 15 +- sys/uvm/uvm_init.c | 14 +- sys/uvm/uvm_io.c | 15 +- sys/uvm/uvm_km.c | 78 +- sys/uvm/uvm_map.c | 6986 ++++++++++++++++++++++++++------------------ sys/uvm/uvm_map.h | 307 +- sys/uvm/uvm_mmap.c | 81 +- sys/uvm/uvm_unix.c | 10 +- 20 files changed, 6100 insertions(+), 3197 deletions(-) create mode 100644 sys/uvm/uvm_addr.c create mode 100644 sys/uvm/uvm_addr.h (limited to 'sys') diff --git a/sys/arch/i386/i386/pmap.c b/sys/arch/i386/i386/pmap.c index 2cdfba314d7..f8f05cb8b88 100644 --- a/sys/arch/i386/i386/pmap.c +++ b/sys/arch/i386/i386/pmap.c @@ -1,4 +1,4 @@ -/* $OpenBSD: pmap.c,v 1.156 2012/02/19 17:14:28 kettenis Exp $ */ +/* $OpenBSD: pmap.c,v 1.157 2012/03/09 13:01:28 ariane Exp $ */ /* $NetBSD: pmap.c,v 1.91 2000/06/02 17:46:37 thorpej Exp $ */ /* @@ -602,14 +602,16 @@ pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb) vaddr_t va = 0; vm_map_lock(map); - for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) { - /* - * This entry has greater va than the entries before. - * We need to make it point to the last page, not past it. - */ + RB_FOREACH_REVERSE(ent, uvm_map_addr, &map->addr) { if (ent->protection & VM_PROT_EXECUTE) - va = trunc_page(ent->end - 1); + break; } + /* + * This entry has greater va than the entries before. + * We need to make it point to the last page, not past it. 
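+	 * If the map holds no executable entry at all, the reverse walk runs
+	 * off the end and ent is NULL, so va keeps its initial value of 0.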
+ */ + if (ent) + va = trunc_page(ent->end - 1); vm_map_unlock(map); if (va <= pm->pm_hiexec) { @@ -1244,7 +1246,7 @@ pmap_free_pvpage(void) { int s; struct vm_map *map; - struct vm_map_entry *dead_entries; + struct uvm_map_deadq dead_entries; struct pv_page *pvp; s = splvm(); /* protect kmem_map */ @@ -1265,13 +1267,12 @@ pmap_free_pvpage(void) TAILQ_REMOVE(&pv_unusedpgs, pvp, pvinfo.pvpi_list); /* unmap the page */ - dead_entries = NULL; + TAILQ_INIT(&dead_entries); uvm_unmap_remove(map, (vaddr_t)pvp, ((vaddr_t)pvp) + PAGE_SIZE, - &dead_entries, NULL, FALSE); + &dead_entries, FALSE, TRUE); vm_map_unlock(map); - if (dead_entries != NULL) - uvm_unmap_detach(dead_entries, 0); + uvm_unmap_detach(&dead_entries, 0); pv_nfpvents -= PVE_PER_PVPAGE; /* update free count */ } diff --git a/sys/conf/files b/sys/conf/files index 5f12fdfdc26..379b1e2cdb8 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1,4 +1,4 @@ -# $OpenBSD: files,v 1.533 2011/12/31 17:06:10 jsing Exp $ +# $OpenBSD: files,v 1.534 2012/03/09 13:01:28 ariane Exp $ # $NetBSD: files,v 1.87 1996/05/19 17:17:50 jonathan Exp $ # @(#)files.newconf 7.5 (Berkeley) 5/10/93 @@ -987,6 +987,7 @@ file nnpfs/nnpfs_vfsops-openbsd.c nnpfs file nnpfs/nnpfs_vnodeops-bsd.c nnpfs file nnpfs/nnpfs_vnodeops-common.c nnpfs file nnpfs/nnpfs_syscalls-dummy.c !nnpfs +file uvm/uvm_addr.c file uvm/uvm_amap.c file uvm/uvm_anon.c file uvm/uvm_aobj.c diff --git a/sys/dev/pci/drm/drm_bufs.c b/sys/dev/pci/drm/drm_bufs.c index 08df8480a55..2f64aa271fa 100644 --- a/sys/dev/pci/drm/drm_bufs.c +++ b/sys/dev/pci/drm/drm_bufs.c @@ -1,4 +1,4 @@ -/* $OpenBSD: drm_bufs.c,v 1.48 2011/06/02 18:22:00 weerd Exp $ */ +/* $OpenBSD: drm_bufs.c,v 1.49 2012/03/09 13:01:28 ariane Exp $ */ /*- * Copyright 1999, 2000 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. @@ -958,7 +958,7 @@ drm_mapbufs(struct drm_device *dev, void *data, struct drm_file *file_priv) foff = 0; } - vaddr = uvm_map_hint(curproc, VM_PROT_READ | VM_PROT_WRITE); + vaddr = 0; retcode = uvm_mmap(&curproc->p_vmspace->vm_map, &vaddr, size, UVM_PROT_READ | UVM_PROT_WRITE, UVM_PROT_ALL, MAP_SHARED, (caddr_t)vn, foff, curproc->p_rlimit[RLIMIT_MEMLOCK].rlim_cur, diff --git a/sys/dev/pci/drm/i915_drv.c b/sys/dev/pci/drm/i915_drv.c index 76ad35fe01a..602bac7a77e 100644 --- a/sys/dev/pci/drm/i915_drv.c +++ b/sys/dev/pci/drm/i915_drv.c @@ -1,4 +1,4 @@ -/* $OpenBSD: i915_drv.c,v 1.118 2011/09/20 14:29:34 kettenis Exp $ */ +/* $OpenBSD: i915_drv.c,v 1.119 2012/03/09 13:01:28 ariane Exp $ */ /* * Copyright (c) 2008-2009 Owain G. Ainsworth * @@ -1438,10 +1438,10 @@ i915_gem_gtt_map_ioctl(struct drm_device *dev, void *data, * We give our reference from object_lookup to the mmap, so only * must free it in the case that the map fails. 
*/ - addr = uvm_map_hint(curproc, VM_PROT_READ | VM_PROT_WRITE); - ret = uvm_map_p(&curproc->p_vmspace->vm_map, &addr, nsize, &obj->uobj, + addr = 0; + ret = uvm_map(&curproc->p_vmspace->vm_map, &addr, nsize, &obj->uobj, offset, 0, UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW, - UVM_INH_SHARE, UVM_ADV_RANDOM, 0), curproc); + UVM_INH_SHARE, UVM_ADV_RANDOM, 0)); done: if (ret == 0) diff --git a/sys/kern/exec_elf.c b/sys/kern/exec_elf.c index 2e615de374d..4e9f314965f 100644 --- a/sys/kern/exec_elf.c +++ b/sys/kern/exec_elf.c @@ -1,4 +1,4 @@ -/* $OpenBSD: exec_elf.c,v 1.85 2011/07/05 04:48:02 guenther Exp $ */ +/* $OpenBSD: exec_elf.c,v 1.86 2012/03/09 13:01:28 ariane Exp $ */ /* * Copyright (c) 1996 Per Fogelstrom @@ -326,6 +326,7 @@ ELFNAME(load_file)(struct proc *p, char *path, struct exec_package *epp, int nload, idx = 0; Elf_Addr pos = *last; int file_align; + int loop; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, p); if ((error = namei(&nd)) != 0) { @@ -377,11 +378,12 @@ ELFNAME(load_file)(struct proc *p, char *path, struct exec_package *epp, * would (i.e. something safely out of the way). */ if (pos == ELFDEFNNAME(NO_ADDR)) { - pos = uvm_map_hint(p, VM_PROT_EXECUTE); + pos = uvm_map_hint(p->p_vmspace, VM_PROT_EXECUTE); } pos = ELF_ROUND(pos, file_align); *last = epp->ep_interp_pos = pos; + loop = 0; for (i = 0; i < nload;/**/) { vaddr_t addr; struct uvm_object *uobj; @@ -409,17 +411,17 @@ ELFNAME(load_file)(struct proc *p, char *path, struct exec_package *epp, addr = round_page((vaddr_t)p->p_vmspace->vm_daddr + BRKSIZ); - vm_map_lock(&p->p_vmspace->vm_map); - if (uvm_map_findspace(&p->p_vmspace->vm_map, addr, size, - &addr, uobj, uoff, 0, UVM_FLAG_FIXED) == NULL) { - if (uvm_map_findspace(&p->p_vmspace->vm_map, addr, size, - &addr, uobj, uoff, 0, 0) == NULL) { - error = ENOMEM; /* XXX */ - vm_map_unlock(&p->p_vmspace->vm_map); - goto bad1; + if (uvm_map_mquery(&p->p_vmspace->vm_map, &addr, size, + (i == 0 ? uoff : UVM_UNKNOWN_OFFSET), 0) != 0) { + if (loop == 0) { + loop = 1; + i = 0; + *last = epp->ep_interp_pos = pos = 0; + continue; } - } - vm_map_unlock(&p->p_vmspace->vm_map); + error = ENOMEM; + goto bad1; + } if (addr != pos + loadmap[i].vaddr) { /* base changed. */ pos = addr - trunc_page(loadmap[i].vaddr); diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index c8c2f1c1378..4fa96597e89 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -1,4 +1,4 @@ -/* $OpenBSD: kern_exec.c,v 1.124 2012/02/20 22:23:39 guenther Exp $ */ +/* $OpenBSD: kern_exec.c,v 1.125 2012/03/09 13:01:28 ariane Exp $ */ /* $NetBSD: kern_exec.c,v 1.75 1996/02/09 18:59:28 christos Exp $ */ /*- @@ -821,7 +821,6 @@ exec_sigcode_map(struct proc *p, struct emul *e) e->e_sigobject = uao_create(sz, 0); uao_reference(e->e_sigobject); /* permanent reference */ - va = vm_map_min(kernel_map); /* hint */ if ((r = uvm_map(kernel_map, &va, round_page(sz), e->e_sigobject, 0, 0, UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW, UVM_INH_SHARE, UVM_ADV_RANDOM, 0)))) { @@ -832,8 +831,7 @@ exec_sigcode_map(struct proc *p, struct emul *e) uvm_unmap(kernel_map, va, va + round_page(sz)); } - /* Just a hint to uvm_mmap where to put it. 
*/ - p->p_sigcode = uvm_map_hint(p, VM_PROT_READ|VM_PROT_EXECUTE); + p->p_sigcode = 0; /* no hint */ uao_reference(e->e_sigobject); if (uvm_map(&p->p_vmspace->vm_map, &p->p_sigcode, round_page(sz), e->e_sigobject, 0, 0, UVM_MAPFLAG(UVM_PROT_RX, UVM_PROT_RX, diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c index d6fc8d3ac20..6c5f9e8bfdd 100644 --- a/sys/kern/kern_malloc.c +++ b/sys/kern/kern_malloc.c @@ -1,4 +1,4 @@ -/* $OpenBSD: kern_malloc.c,v 1.90 2011/09/22 21:52:36 jsing Exp $ */ +/* $OpenBSD: kern_malloc.c,v 1.91 2012/03/09 13:01:28 ariane Exp $ */ /* $NetBSD: kern_malloc.c,v 1.15.4.2 1996/06/13 17:10:56 cgd Exp $ */ /* @@ -576,8 +576,13 @@ kmeminit(void) kmeminit_nkmempages(); base = vm_map_min(kernel_map); kmem_map = uvm_km_suballoc(kernel_map, &base, &limit, - (vsize_t)(nkmempages * PAGE_SIZE), VM_MAP_INTRSAFE, FALSE, - &kmem_map_store); + (vsize_t)nkmempages << PAGE_SHIFT, +#ifdef KVA_GUARDPAGES + VM_MAP_INTRSAFE | VM_MAP_GUARDPAGES, +#else + VM_MAP_INTRSAFE, +#endif + FALSE, &kmem_map_store); kmembase = (char *)base; kmemlimit = (char *)limit; kmemusage = (struct kmemusage *) uvm_km_zalloc(kernel_map, diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c index 6b4b0ed18ce..6b6736c153d 100644 --- a/sys/kern/sysv_shm.c +++ b/sys/kern/sysv_shm.c @@ -1,4 +1,4 @@ -/* $OpenBSD: sysv_shm.c,v 1.54 2011/10/27 07:56:28 robert Exp $ */ +/* $OpenBSD: sysv_shm.c,v 1.55 2012/03/09 13:01:28 ariane Exp $ */ /* $NetBSD: sysv_shm.c,v 1.50 1998/10/21 22:24:29 tron Exp $ */ /* @@ -261,10 +261,8 @@ sys_shmat(struct proc *p, void *v, register_t *retval) attach_va = (vaddr_t)SCARG(uap, shmaddr); else return (EINVAL); - } else { - /* This is just a hint to uvm_map() about where to put it. */ - attach_va = uvm_map_hint(p, prot); - } + } else + attach_va = 0; shm_handle = shmseg->shm_internal; uao_reference(shm_handle->shm_object); error = uvm_map(&p->p_vmspace->vm_map, &attach_va, size, diff --git a/sys/uvm/uvm.h b/sys/uvm/uvm.h index 939738f47aa..c236fb421a9 100644 --- a/sys/uvm/uvm.h +++ b/sys/uvm/uvm.h @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm.h,v 1.46 2011/07/06 19:50:38 beck Exp $ */ +/* $OpenBSD: uvm.h,v 1.47 2012/03/09 13:01:29 ariane Exp $ */ /* $NetBSD: uvm.h,v 1.24 2000/11/27 08:40:02 chs Exp $ */ /* @@ -120,6 +120,7 @@ struct uvm { #define UVM_ET_COPYONWRITE 0x04 /* copy_on_write */ #define UVM_ET_NEEDSCOPY 0x08 /* needs_copy */ #define UVM_ET_HOLE 0x10 /* no backend */ +#define UVM_ET_FREEMAPPED 0x80 /* map entry is on free list (DEBUG) */ #define UVM_ET_ISOBJ(E) (((E)->etype & UVM_ET_OBJ) != 0) #define UVM_ET_ISSUBMAP(E) (((E)->etype & UVM_ET_SUBMAP) != 0) @@ -154,6 +155,23 @@ do { \ #define UVM_PAGE_OWN(PG, TAG) /* nothing */ #endif /* UVM_PAGE_TRKOWN */ +/* + * uvm_map internal functions. + * Used by uvm_map address selectors. 
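+ * Each map entry also describes the gap that follows it: 'guard' pages that
+ * are kept unmapped and 'fspace' bytes of allocatable free space. The
+ * VMMAP_FREE_START/VMMAP_FREE_END macros below delimit that allocatable
+ * range.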
+ */ + +struct vm_map_entry *uvm_map_entrybyaddr(struct uvm_map_addr*, vaddr_t); +int uvm_map_isavail(struct vm_map*, + struct uvm_addr_state*, + struct vm_map_entry**, struct vm_map_entry**, + vaddr_t, vsize_t); +struct uvm_addr_state *uvm_map_uaddr(struct vm_map*, vaddr_t); +struct uvm_addr_state *uvm_map_uaddr_e(struct vm_map*, struct vm_map_entry*); + +#define VMMAP_FREE_START(_entry) ((_entry)->end + (_entry)->guard) +#define VMMAP_FREE_END(_entry) ((_entry)->end + (_entry)->guard + \ + (_entry)->fspace) + #endif /* _KERNEL */ #endif /* _UVM_UVM_H_ */ diff --git a/sys/uvm/uvm_addr.c b/sys/uvm/uvm_addr.c new file mode 100644 index 00000000000..486198e3891 --- /dev/null +++ b/sys/uvm/uvm_addr.c @@ -0,0 +1,1556 @@ +/* $OpenBSD: uvm_addr.c,v 1.1 2012/03/09 13:01:29 ariane Exp $ */ + +/* + * Copyright (c) 2011 Ariane van der Steldt + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* #define DEBUG */ + +#include +#include +#include +#include +#include + +/* Max gap between hint allocations. */ +#define UADDR_HINT_MAXGAP (4 * PAGE_SIZE) +/* Number of pivots in pivot allocator. */ +#define NUM_PIVOTS 16 +/* + * Max number (inclusive) of pages the pivot allocator + * will place between allocations. + * + * The uaddr_pivot_random() function attempts to bias towards + * small space between allocations, so putting a large number here is fine. + */ +#define PIVOT_RND 8 +/* + * Number of allocations that a pivot can supply before expiring. + * When a pivot expires, a new pivot has to be found. + * + * Must be at least 1. + */ +#define PIVOT_EXPIRE 1024 + + +/* Pool with uvm_addr_state structures. */ +struct pool uaddr_pool; +struct pool uaddr_hint_pool; +struct pool uaddr_bestfit_pool; +struct pool uaddr_pivot_pool; +struct pool uaddr_rnd_pool; + +/* uvm_addr state for hint based selector. */ +struct uaddr_hint_state { + struct uvm_addr_state uaddr; + vsize_t max_dist; +}; + +/* uvm_addr state for bestfit selector. */ +struct uaddr_bestfit_state { + struct uvm_addr_state ubf_uaddr; + struct uaddr_free_rbtree ubf_free; +}; + +/* uvm_addr state for rnd selector. */ +struct uaddr_rnd_state { + struct uvm_addr_state ur_uaddr; + TAILQ_HEAD(, vm_map_entry) ur_free; +}; + +/* + * Definition of a pivot in pivot selector. + */ +struct uaddr_pivot { + vaddr_t addr; /* End of prev. allocation. */ + int expire;/* Best before date. */ + int dir; /* Direction. */ + struct vm_map_entry *entry; /* Will contain next alloc. */ +}; +/* uvm_addr state for pivot selector. */ +struct uaddr_pivot_state { + struct uvm_addr_state up_uaddr; + + /* Free space tree, for fast pivot selection. */ + struct uaddr_free_rbtree up_free; + + /* List of pivots. The pointers point to after the last allocation. */ + struct uaddr_pivot up_pivots[NUM_PIVOTS]; +}; + +/* + * Free space comparison. + * Compares smaller free-space before larger free-space. 
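+ * Ties on fspace are broken by start address, so the ordering is total and
+ * the tree can hold multiple entries with equal free space.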
+ */ +static __inline int +uvm_mapent_fspace_cmp(struct vm_map_entry *e1, struct vm_map_entry *e2) +{ + if (e1->fspace != e2->fspace) + return (e1->fspace < e2->fspace ? -1 : 1); + return (e1->start < e2->start ? -1 : e1->start > e2->start); +} + +/* Forward declaration (see below). */ +extern const struct uvm_addr_functions uaddr_kernel_functions; +struct uvm_addr_state uaddr_kbootstrap; + + +/* + * Support functions. + */ + +struct vm_map_entry *uvm_addr_entrybyspace(struct uaddr_free_rbtree*, + vsize_t); +void uaddr_kinsert(struct vm_map*, struct uvm_addr_state*, + struct vm_map_entry*); +void uaddr_kremove(struct vm_map*, struct uvm_addr_state*, + struct vm_map_entry*); +void uaddr_kbootstrapdestroy(struct uvm_addr_state*); + +void uaddr_destroy(struct uvm_addr_state*); +void uaddr_hint_destroy(struct uvm_addr_state*); +void uaddr_kbootstrap_destroy(struct uvm_addr_state*); +void uaddr_rnd_destroy(struct uvm_addr_state*); +void uaddr_bestfit_destroy(struct uvm_addr_state*); +void uaddr_pivot_destroy(struct uvm_addr_state*); + +int uaddr_lin_select(struct vm_map*, + struct uvm_addr_state*, struct vm_map_entry**, + vaddr_t*, vsize_t, vaddr_t, vaddr_t, vm_prot_t, + vaddr_t); +int uaddr_kbootstrap_select(struct vm_map*, + struct uvm_addr_state*, struct vm_map_entry**, + vaddr_t*, vsize_t, vaddr_t, vaddr_t, vm_prot_t, + vaddr_t); +int uaddr_rnd_select(struct vm_map*, + struct uvm_addr_state*, struct vm_map_entry**, + vaddr_t*, vsize_t, vaddr_t, vaddr_t, vm_prot_t, + vaddr_t); +int uaddr_hint_select(struct vm_map*, + struct uvm_addr_state*, struct vm_map_entry**, + vaddr_t*, vsize_t, vaddr_t, vaddr_t, vm_prot_t, + vaddr_t); +int uaddr_bestfit_select(struct vm_map*, + struct uvm_addr_state*, struct vm_map_entry**, + vaddr_t*, vsize_t, vaddr_t, vaddr_t, vm_prot_t, + vaddr_t); +int uaddr_pivot_select(struct vm_map*, + struct uvm_addr_state*, struct vm_map_entry**, + vaddr_t*, vsize_t, vaddr_t, vaddr_t, vm_prot_t, + vaddr_t); +int uaddr_stack_brk_select(struct vm_map*, + struct uvm_addr_state*, struct vm_map_entry**, + vaddr_t*, vsize_t, vaddr_t, vaddr_t, vm_prot_t, + vaddr_t); + +void uaddr_rnd_insert(struct vm_map*, + struct uvm_addr_state*, struct vm_map_entry*); +void uaddr_rnd_remove(struct vm_map*, + struct uvm_addr_state*, struct vm_map_entry*); +void uaddr_bestfit_insert(struct vm_map*, + struct uvm_addr_state*, struct vm_map_entry*); +void uaddr_bestfit_remove(struct vm_map*, + struct uvm_addr_state*, struct vm_map_entry*); +void uaddr_pivot_insert(struct vm_map*, + struct uvm_addr_state*, struct vm_map_entry*); +void uaddr_pivot_remove(struct vm_map*, + struct uvm_addr_state*, struct vm_map_entry*); + +vsize_t uaddr_pivot_random(void); +int uaddr_pivot_newpivot(struct vm_map*, + struct uaddr_pivot_state*, struct uaddr_pivot*, + struct vm_map_entry**, vaddr_t*, + vsize_t, vaddr_t, vaddr_t, vsize_t, vsize_t); + +#if defined(DEBUG) || defined(DDB) +void uaddr_pivot_print(struct uvm_addr_state*, boolean_t, + int (*)(const char*, ...)); +void uaddr_rnd_print(struct uvm_addr_state*, boolean_t, + int (*)(const char*, ...)); +#endif /* DEBUG || DDB */ + + +/* + * Find smallest entry in tree that will fit sz bytes. 
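+ * Whenever an entry is large enough, remember it and descend left (towards
+ * smaller entries); otherwise descend right. This returns the smallest
+ * fitting entry in O(log n).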
+ */ +struct vm_map_entry* +uvm_addr_entrybyspace(struct uaddr_free_rbtree *free, vsize_t sz) +{ + struct vm_map_entry *tmp, *res; + + tmp = RB_ROOT(free); + res = NULL; + while (tmp) { + if (tmp->fspace >= sz) { + res = tmp; + tmp = RB_LEFT(tmp, dfree.rbtree); + } else if (tmp->fspace < sz) + tmp = RB_RIGHT(tmp, dfree.rbtree); + } + return res; +} + +static __inline vaddr_t +uvm_addr_align_forward(vaddr_t addr, vaddr_t align, vaddr_t offset) +{ + vaddr_t adjusted; + + KASSERT(offset < align || (align == 0 && offset == 0)); + KASSERT((align & (align - 1)) == 0); + KASSERT((offset & PAGE_MASK) == 0); + + align = MAX(align, PAGE_SIZE); + adjusted = addr & ~(align - 1); + adjusted += offset; + return (adjusted < addr ? adjusted + align : adjusted); +} + +static __inline vaddr_t +uvm_addr_align_backward(vaddr_t addr, vaddr_t align, vaddr_t offset) +{ + vaddr_t adjusted; + + KASSERT(offset < align || (align == 0 && offset == 0)); + KASSERT((align & (align - 1)) == 0); + KASSERT((offset & PAGE_MASK) == 0); + + align = MAX(align, PAGE_SIZE); + adjusted = addr & ~(align - 1); + adjusted += offset; + return (adjusted > addr ? adjusted - align : adjusted); +} + +/* + * Try to fit the requested space into the entry. + */ +int +uvm_addr_fitspace(vaddr_t *min_result, vaddr_t *max_result, + vaddr_t low_addr, vaddr_t high_addr, vsize_t sz, + vaddr_t align, vaddr_t offset, + vsize_t before_gap, vsize_t after_gap) +{ + vaddr_t tmp; + vsize_t fspace; + + if (low_addr > high_addr) + return ENOMEM; + fspace = high_addr - low_addr; + if (fspace < sz + before_gap + after_gap) + return ENOMEM; + + /* + * Calculate lowest address. + */ + low_addr += before_gap; + low_addr = uvm_addr_align_forward(tmp = low_addr, align, offset); + if (low_addr < tmp) /* Overflow during alignment. */ + return ENOMEM; + if (high_addr - after_gap - sz < low_addr) + return ENOMEM; + + /* + * Calculate highest address. + */ + high_addr -= after_gap + sz; + high_addr = uvm_addr_align_backward(tmp = high_addr, align, offset); + if (high_addr > tmp) /* Overflow during alignment. */ + return ENOMEM; + if (low_addr > high_addr) + return ENOMEM; + + *min_result = low_addr; + *max_result = high_addr; + return 0; +} + + +/* + * Initialize uvm_addr. + */ +void +uvm_addr_init() +{ + pool_init(&uaddr_pool, sizeof(struct uvm_addr_state), + 0, 0, 0, "uaddr", &pool_allocator_nointr); + pool_init(&uaddr_hint_pool, sizeof(struct uaddr_hint_state), + 0, 0, 0, "uaddrhint", &pool_allocator_nointr); + pool_init(&uaddr_bestfit_pool, sizeof(struct uaddr_bestfit_state), + 0, 0, 0, "uaddrbestfit", &pool_allocator_nointr); + pool_init(&uaddr_pivot_pool, sizeof(struct uaddr_pivot_state), + 0, 0, 0, "uaddrpivot", &pool_allocator_nointr); + pool_init(&uaddr_rnd_pool, sizeof(struct uaddr_rnd_state), + 0, 0, 0, "uaddrrnd", &pool_allocator_nointr); + + uaddr_kbootstrap.uaddr_minaddr = PAGE_SIZE; + uaddr_kbootstrap.uaddr_maxaddr = -(vaddr_t)PAGE_SIZE; + uaddr_kbootstrap.uaddr_functions = &uaddr_kernel_functions; +} + +/* + * Invoke destructor function of uaddr. + */ +void +uvm_addr_destroy(struct uvm_addr_state *uaddr) +{ + if (uaddr) + (*uaddr->uaddr_functions->uaddr_destroy)(uaddr); +} + +/* + * Move address forward to satisfy align, offset. + */ +vaddr_t +uvm_addr_align(vaddr_t addr, vaddr_t align, vaddr_t offset) +{ + vaddr_t result = (addr & ~(align - 1)) + offset; + if (result < addr) + result += align; + return result; +} + +/* + * Move address backwards to satisfy align, offset. 
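+ * E.g. addr 0x5431000 with align 0x10000 and offset 0x2000 yields
+ * 0x5422000: the highest address <= addr that satisfies the requested
+ * alignment and offset.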
+ */ +vaddr_t +uvm_addr_align_back(vaddr_t addr, vaddr_t align, vaddr_t offset) +{ + vaddr_t result = (addr & ~(align - 1)) + offset; + if (result > addr) + result -= align; + return result; +} + +/* + * Directional first fit. + * + * Do a lineair search for free space, starting at addr in entry. + * direction == 1: search forward + * direction == -1: search backward + * + * Output: low <= addr <= high and entry will contain addr. + * 0 will be returned if no space is available. + * + * gap describes the space that must appear between the preceding entry. + */ +int +uvm_addr_linsearch(struct vm_map *map, struct uvm_addr_state *uaddr, + struct vm_map_entry**entry_out, vaddr_t *addr_out, + vaddr_t hint, vsize_t sz, vaddr_t align, vaddr_t offset, + int direction, vaddr_t low, vaddr_t high, + vsize_t before_gap, vsize_t after_gap) +{ + struct vm_map_entry *entry; + vaddr_t low_addr, high_addr; + + KASSERT(entry_out != NULL && addr_out != NULL); + KASSERT(direction == -1 || direction == 1); + KASSERT((hint & PAGE_MASK) == 0 && (high & PAGE_MASK) == 0 && + (low & PAGE_MASK) == 0 && + (before_gap & PAGE_MASK) == 0 && (after_gap & PAGE_MASK) == 0); + KASSERT(high + sz > high); /* Check for overflow. */ + + /* + * Hint magic. + */ + if (hint == 0) + hint = (direction == 1 ? low : high); + else if (hint > high) { + if (direction != -1) + return ENOMEM; + hint = high; + } else if (hint < low) { + if (direction != 1) + return ENOMEM; + hint = low; + } + + for (entry = uvm_map_entrybyaddr(&map->addr, + hint - (direction == -1 ? 1 : 0)); entry != NULL; + entry = (direction == 1 ? + RB_NEXT(uvm_map_addr, &map->addr, entry) : + RB_PREV(uvm_map_addr, &map->addr, entry))) { + if (VMMAP_FREE_START(entry) > high || + VMMAP_FREE_END(entry) < low) { + break; + } + + if (uvm_addr_fitspace(&low_addr, &high_addr, + MAX(low, VMMAP_FREE_START(entry)), + MIN(high, VMMAP_FREE_END(entry)), + sz, align, offset, before_gap, after_gap) == 0) { + *entry_out = entry; + if (hint >= low_addr && hint <= high_addr) { + *addr_out = hint; + } else { + *addr_out = (direction == 1 ? + low_addr : high_addr); + } + return 0; + } + } + + return ENOMEM; +} + +/* + * Invoke address selector of uaddr. + * uaddr may be NULL, in which case the algorithm will fail with ENOMEM. + * + * Will invoke uvm_addr_isavail to fill in last_out. 
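+ * (The availability check doubles as a sanity check: a selector that hands
+ * out space which is not actually free triggers a panic.)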
+ */ +int +uvm_addr_invoke(struct vm_map *map, struct uvm_addr_state *uaddr, + struct vm_map_entry**entry_out, struct vm_map_entry**last_out, + vaddr_t *addr_out, + vsize_t sz, vaddr_t align, vaddr_t offset, vm_prot_t prot, vaddr_t hint) +{ + int error; + + if (uaddr == NULL) + return ENOMEM; + + hint &= ~((vaddr_t)PAGE_MASK); + if (hint != 0 && + !(hint >= uaddr->uaddr_minaddr && hint < uaddr->uaddr_maxaddr)) + return ENOMEM; + + error = (*uaddr->uaddr_functions->uaddr_select)(map, uaddr, + entry_out, addr_out, sz, align, offset, prot, hint); + + if (error == 0) { + KASSERT(*entry_out != NULL); + *last_out = NULL; + if (!uvm_map_isavail(map, uaddr, entry_out, last_out, + *addr_out, sz)) { + panic("uvm_addr_invoke: address selector %p " + "(%s 0x%lx-0x%lx) " + "returned unavailable address 0x%lx", + uaddr, uaddr->uaddr_functions->uaddr_name, + uaddr->uaddr_minaddr, uaddr->uaddr_maxaddr, + *addr_out); + } + } + + return error; +} + +#if defined(DEBUG) || defined(DDB) +void +uvm_addr_print(struct uvm_addr_state *uaddr, const char *slot, boolean_t full, + int (*pr)(const char*, ...)) +{ + if (uaddr == NULL) { + (*pr)("- uvm_addr %s: NULL\n", slot); + return; + } + + (*pr)("- uvm_addr %s: %p (%s 0x%lx-0x%lx)\n", slot, uaddr, + uaddr->uaddr_functions->uaddr_name, + uaddr->uaddr_minaddr, uaddr->uaddr_maxaddr); + if (uaddr->uaddr_functions->uaddr_print == NULL) + return; + + (*uaddr->uaddr_functions->uaddr_print)(uaddr, full, pr); +} +#endif /* DEBUG || DDB */ + +/* + * Destroy a uvm_addr_state structure. + * The uaddr must have been previously allocated from uaddr_state_pool. + */ +void +uaddr_destroy(struct uvm_addr_state *uaddr) +{ + pool_put(&uaddr_pool, uaddr); +} + + +/* + * Lineair allocator. + * This allocator uses a first-fit algorithm. + * + * If hint is set, search will start at the hint position. + * Only searches forward. + */ + +const struct uvm_addr_functions uaddr_lin_functions = { + .uaddr_select = &uaddr_lin_select, + .uaddr_destroy = &uaddr_destroy, + .uaddr_name = "uaddr_lin" +}; + +struct uvm_addr_state* +uaddr_lin_create(vaddr_t minaddr, vaddr_t maxaddr) +{ + struct uvm_addr_state* uaddr; + + uaddr = pool_get(&uaddr_pool, PR_WAITOK); + uaddr->uaddr_minaddr = minaddr; + uaddr->uaddr_maxaddr = maxaddr; + uaddr->uaddr_functions = &uaddr_lin_functions; + return uaddr; +} + +int +uaddr_lin_select(struct vm_map *map, struct uvm_addr_state *uaddr, + struct vm_map_entry**entry_out, vaddr_t *addr_out, + vsize_t sz, vaddr_t align, vaddr_t offset, + vm_prot_t prot, vaddr_t hint) +{ + vaddr_t guard_sz; + + /* + * Deal with guardpages: search for space with one extra page. + */ + guard_sz = ((map->flags & VM_MAP_GUARDPAGES) == 0 ? 0 : PAGE_SIZE); + + if (uaddr->uaddr_maxaddr - uaddr->uaddr_minaddr < sz + guard_sz) + return ENOMEM; + return uvm_addr_linsearch(map, uaddr, entry_out, addr_out, 0, sz, + align, offset, 1, uaddr->uaddr_minaddr, uaddr->uaddr_maxaddr - sz, + 0, guard_sz); +} + + +/* + * Randomized allocator. + * This allocator use uvm_map_hint to acquire a random address and searches + * from there. 
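+ * In compatibility mode the search only walks forward from the (randomised)
+ * hint; addresses below the hint are never considered, mimicking the old
+ * allocator's behaviour.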
+ */ + +const struct uvm_addr_functions uaddr_rnd_functions = { + .uaddr_select = &uaddr_rnd_select, + .uaddr_free_insert = &uaddr_rnd_insert, + .uaddr_free_remove = &uaddr_rnd_remove, + .uaddr_destroy = &uaddr_rnd_destroy, +#if defined(DEBUG) || defined(DDB) + .uaddr_print = &uaddr_rnd_print, +#endif /* DEBUG || DDB */ + .uaddr_name = "uaddr_rnd" +}; + +struct uvm_addr_state* +uaddr_rnd_create(vaddr_t minaddr, vaddr_t maxaddr) +{ + struct uaddr_rnd_state* uaddr; + + uaddr = pool_get(&uaddr_rnd_pool, PR_WAITOK); + uaddr->ur_uaddr.uaddr_minaddr = minaddr; + uaddr->ur_uaddr.uaddr_maxaddr = maxaddr; + uaddr->ur_uaddr.uaddr_functions = &uaddr_rnd_functions; + TAILQ_INIT(&uaddr->ur_free); + return &uaddr->ur_uaddr; +} + +int +uaddr_rnd_select(struct vm_map *map, struct uvm_addr_state *uaddr, + struct vm_map_entry**entry_out, vaddr_t *addr_out, + vsize_t sz, vaddr_t align, vaddr_t offset, + vm_prot_t prot, vaddr_t hint) +{ + struct vmspace *vm; + vaddr_t guard_sz; + vaddr_t low_addr, high_addr; + struct vm_map_entry *entry; + vsize_t before_gap, after_gap; + vaddr_t tmp; + + KASSERT((map->flags & VM_MAP_ISVMSPACE) != 0); + vm = (struct vmspace*)map; + + /* Deal with guardpages: search for space with one extra page. */ + guard_sz = ((map->flags & VM_MAP_GUARDPAGES) == 0 ? 0 : PAGE_SIZE); + + /* Quick fail if the allocation won't fit. */ + if (uaddr->uaddr_maxaddr - uaddr->uaddr_minaddr < sz + guard_sz) + return ENOMEM; + + /* Select a hint. */ + if (hint == 0) + hint = uvm_map_hint(vm, prot); + /* Clamp hint to uaddr range. */ + hint = MIN(MAX(hint, uaddr->uaddr_minaddr), + uaddr->uaddr_maxaddr - sz - guard_sz); + + /* Align hint to align,offset parameters. */ + tmp = hint; + hint = uvm_addr_align_forward(tmp, align, offset); + /* Check for overflow during alignment. */ + if (hint < tmp || hint > uaddr->uaddr_maxaddr - sz - guard_sz) + return ENOMEM; /* Compatibility mode: never look backwards. */ + + before_gap = 0; + after_gap = guard_sz; + + /* + * Find the first entry at or after hint with free space. + * + * Since we need an entry that is on the free-list, search until + * we hit an entry that is owned by our uaddr. + */ + for (entry = uvm_map_entrybyaddr(&map->addr, hint); + entry != NULL && + uvm_map_uaddr_e(map, entry) != uaddr; + entry = RB_NEXT(uvm_map_addr, &map->addr, entry)) { + /* Fail if we search past uaddr_maxaddr. */ + if (VMMAP_FREE_START(entry) >= uaddr->uaddr_maxaddr) { + entry = NULL; + break; + } + } + + for ( /* initial entry filled in above */ ; + entry != NULL && VMMAP_FREE_START(entry) < uaddr->uaddr_maxaddr; + entry = TAILQ_NEXT(entry, dfree.tailq)) { + if (uvm_addr_fitspace(&low_addr, &high_addr, + MAX(uaddr->uaddr_minaddr, VMMAP_FREE_START(entry)), + MIN(uaddr->uaddr_maxaddr, VMMAP_FREE_END(entry)), + sz, align, offset, before_gap, after_gap) == 0) { + *entry_out = entry; + if (hint >= low_addr && hint <= high_addr) + *addr_out = hint; + else + *addr_out = low_addr; + return 0; + } + } + + return ENOMEM; +} + +/* + * Destroy a uaddr_rnd_state structure. + */ +void +uaddr_rnd_destroy(struct uvm_addr_state *uaddr) +{ + pool_put(&uaddr_rnd_pool, uaddr); +} + +/* + * Add entry to tailq. + */ +void +uaddr_rnd_insert(struct vm_map *map, struct uvm_addr_state *uaddr_p, + struct vm_map_entry *entry) +{ + struct uaddr_rnd_state *uaddr; + struct vm_map_entry *prev; + + uaddr = (struct uaddr_rnd_state*)uaddr_p; + KASSERT(entry == RB_FIND(uvm_map_addr, &map->addr, entry)); + + /* + * Make prev the first vm_map_entry before entry. 
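+	 * More precisely: the nearest preceding entry whose free space is
+	 * also managed by this selector, so that ur_free stays sorted by
+	 * address.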
+ */ + for (prev = RB_PREV(uvm_map_addr, &map->addr, entry); + prev != NULL; + prev = RB_PREV(uvm_map_addr, &map->addr, prev)) { + /* Stop and fail when reaching uaddr minaddr. */ + if (VMMAP_FREE_START(prev) < uaddr_p->uaddr_minaddr) { + prev = NULL; + break; + } + + KASSERT(prev->etype & UVM_ET_FREEMAPPED); + if (uvm_map_uaddr_e(map, prev) == uaddr_p) + break; + } + + /* Perform insertion. */ + if (prev == NULL) + TAILQ_INSERT_HEAD(&uaddr->ur_free, entry, dfree.tailq); + else + TAILQ_INSERT_AFTER(&uaddr->ur_free, prev, entry, dfree.tailq); +} + +/* + * Remove entry from tailq. + */ +void +uaddr_rnd_remove(struct vm_map *map, struct uvm_addr_state *uaddr_p, + struct vm_map_entry *entry) +{ + struct uaddr_rnd_state *uaddr; + + uaddr = (struct uaddr_rnd_state*)uaddr_p; + TAILQ_REMOVE(&uaddr->ur_free, entry, dfree.tailq); +} + +#if defined(DEBUG) || defined(DDB) +void +uaddr_rnd_print(struct uvm_addr_state *uaddr_p, boolean_t full, + int (*pr)(const char*, ...)) +{ + struct vm_map_entry *entry; + struct uaddr_rnd_state *uaddr; + vaddr_t addr; + size_t count; + vsize_t space; + + uaddr = (struct uaddr_rnd_state*)uaddr_p; + addr = 0; + count = 0; + space = 0; + TAILQ_FOREACH(entry, &uaddr->ur_free, dfree.tailq) { + count++; + space += entry->fspace; + + if (full) { + (*pr)("\tentry %p: 0x%lx-0x%lx G=0x%lx F=0x%lx\n", + entry, entry->start, entry->end, + entry->guard, entry->fspace); + (*pr)("\t\tfree: 0x%lx-0x%lx\n", + VMMAP_FREE_START(entry), VMMAP_FREE_END(entry)); + } + if (entry->start < addr) { + if (!full) + (*pr)("\tentry %p: 0x%lx-0x%lx " + "G=0x%lx F=0x%lx\n", + entry, entry->start, entry->end, + entry->guard, entry->fspace); + (*pr)("\t\tstart=0x%lx, expected at least 0x%lx\n", + entry->start, addr); + } + + addr = VMMAP_FREE_END(entry); + } + (*pr)("\t0x%lu entries, 0x%lx free bytes\n", count, space); +} +#endif /* DEBUG || DDB */ + + +/* + * An allocator that selects an address within distance of the hint. + * + * If no hint is given, the allocator refuses to allocate. + */ + +const struct uvm_addr_functions uaddr_hint_functions = { + .uaddr_select = &uaddr_hint_select, + .uaddr_destroy = &uaddr_hint_destroy, + .uaddr_name = "uaddr_hint" +}; + +/* + * Create uaddr_hint state. + */ +struct uvm_addr_state* +uaddr_hint_create(vaddr_t minaddr, vaddr_t maxaddr, vsize_t max_dist) +{ + struct uaddr_hint_state* ua_hint; + + KASSERT(uaddr_hint_pool.pr_size == sizeof(*ua_hint)); + + ua_hint = pool_get(&uaddr_hint_pool, PR_WAITOK); + ua_hint->uaddr.uaddr_minaddr = minaddr; + ua_hint->uaddr.uaddr_maxaddr = maxaddr; + ua_hint->uaddr.uaddr_functions = &uaddr_hint_functions; + ua_hint->max_dist = max_dist; + return &ua_hint->uaddr; +} + +/* + * Destroy uaddr_hint state. + */ +void +uaddr_hint_destroy(struct uvm_addr_state *uaddr) +{ + pool_put(&uaddr_hint_pool, uaddr); +} + +/* + * Hint selector. + * + * Attempts to find an address that is within max_dist of the hint. + */ +int +uaddr_hint_select(struct vm_map *map, struct uvm_addr_state *uaddr_param, + struct vm_map_entry**entry_out, vaddr_t *addr_out, + vsize_t sz, vaddr_t align, vaddr_t offset, + vm_prot_t prot, vaddr_t hint) +{ + struct uaddr_hint_state *uaddr = (struct uaddr_hint_state*)uaddr_param; + vsize_t before_gap, after_gap; + vaddr_t low, high; + int dir; + + if (hint == 0) + return ENOMEM; + + /* + * Calculate upper and lower bound for selected address. 
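+	 * The selected address must lie within max_dist of the hint; the
+	 * range is further clipped to the selector's own min/max range and,
+	 * on overflow or underflow, to the map bounds.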
+ */ + high = hint + uaddr->max_dist; + if (high < hint) /* overflow */ + high = map->max_offset; + high = MIN(high, uaddr->uaddr.uaddr_maxaddr); + if (high < sz) + return ENOMEM; /* Protect against underflow. */ + high -= sz; + + /* Calculate lower bound for selected address. */ + low = hint - uaddr->max_dist; + if (low > hint) /* underflow */ + low = map->min_offset; + low = MAX(low, uaddr->uaddr.uaddr_minaddr); + + /* Search strategy setup. */ + before_gap = PAGE_SIZE + + (arc4random_uniform(UADDR_HINT_MAXGAP) & ~(vaddr_t)PAGE_MASK); + after_gap = PAGE_SIZE + + (arc4random_uniform(UADDR_HINT_MAXGAP) & ~(vaddr_t)PAGE_MASK); + dir = (arc4random() & 0x01) ? 1 : -1; + + /* + * Try to search: + * - forward, with gap + * - backward, with gap + * - forward, without gap + * - backward, without gap + * (Where forward is in the direction specified by dir and + * backward is in the direction specified by -dir). + */ + if (uvm_addr_linsearch(map, uaddr_param, + entry_out, addr_out, hint, sz, align, offset, + dir, low, high, before_gap, after_gap) == 0) + return 0; + if (uvm_addr_linsearch(map, uaddr_param, + entry_out, addr_out, hint, sz, align, offset, + -dir, low, high, before_gap, after_gap) == 0) + return 0; + + if (uvm_addr_linsearch(map, uaddr_param, + entry_out, addr_out, hint, sz, align, offset, + dir, low, high, 0, 0) == 0) + return 0; + if (uvm_addr_linsearch(map, uaddr_param, + entry_out, addr_out, hint, sz, align, offset, + -dir, low, high, 0, 0) == 0) + return 0; + + return ENOMEM; +} + +/* + * Kernel allocation bootstrap logic. + */ + +const struct uvm_addr_functions uaddr_kernel_functions = { + .uaddr_select = &uaddr_kbootstrap_select, + .uaddr_destroy = &uaddr_kbootstrap_destroy, + .uaddr_name = "uaddr_kbootstrap" +}; + +/* + * Select an address from the map. + * + * This function ignores the uaddr spec and instead uses the map directly. + * Because of that property, the uaddr algorithm can be shared across all + * kernel maps. + */ +int +uaddr_kbootstrap_select(struct vm_map *map, struct uvm_addr_state *uaddr, + struct vm_map_entry **entry_out, vaddr_t *addr_out, + vsize_t sz, vaddr_t align, vaddr_t offset, vm_prot_t prot, vaddr_t hint) +{ + vaddr_t tmp; + + RB_FOREACH(*entry_out, uvm_map_addr, &map->addr) { + if (VMMAP_FREE_END(*entry_out) <= uvm_maxkaddr && + uvm_addr_fitspace(addr_out, &tmp, + VMMAP_FREE_START(*entry_out), VMMAP_FREE_END(*entry_out), + sz, align, offset, 0, 0) == 0) + return 0; + } + + return ENOMEM; +} + +/* + * Don't destroy the kernel bootstrap allocator. + */ +void +uaddr_kbootstrap_destroy(struct uvm_addr_state *uaddr) +{ + KASSERT(uaddr == (struct uvm_addr_state*)&uaddr_kbootstrap); +} + +/* + * Best fit algorithm. 
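+ * Free ranges are kept in a tree ordered by size (ubf_free); the selector
+ * picks the smallest range that can hold the request and places the
+ * allocation at whichever end of that range leaves the least slack.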
+ */ + +const struct uvm_addr_functions uaddr_bestfit_functions = { + .uaddr_select = &uaddr_bestfit_select, + .uaddr_free_insert = &uaddr_bestfit_insert, + .uaddr_free_remove = &uaddr_bestfit_remove, + .uaddr_destroy = &uaddr_bestfit_destroy, + .uaddr_name = "uaddr_bestfit" +}; + +struct uvm_addr_state* +uaddr_bestfit_create(vaddr_t minaddr, vaddr_t maxaddr) +{ + struct uaddr_bestfit_state *uaddr; + + uaddr = pool_get(&uaddr_bestfit_pool, PR_WAITOK); + uaddr->ubf_uaddr.uaddr_minaddr = minaddr; + uaddr->ubf_uaddr.uaddr_maxaddr = maxaddr; + uaddr->ubf_uaddr.uaddr_functions = &uaddr_bestfit_functions; + RB_INIT(&uaddr->ubf_free); + return &uaddr->ubf_uaddr; +} + +void +uaddr_bestfit_destroy(struct uvm_addr_state *uaddr) +{ + pool_put(&uaddr_bestfit_pool, uaddr); +} + +void +uaddr_bestfit_insert(struct vm_map *map, struct uvm_addr_state *uaddr_p, + struct vm_map_entry *entry) +{ + struct uaddr_bestfit_state *uaddr; + struct vm_map_entry *rb_rv; + + uaddr = (struct uaddr_bestfit_state*)uaddr_p; + if ((rb_rv = RB_INSERT(uaddr_free_rbtree, &uaddr->ubf_free, entry)) != + NULL) { + panic("%s: duplicate insertion: state %p " + "interting %p, colliding with %p", __func__, + uaddr, entry, rb_rv); + } +} + +void +uaddr_bestfit_remove(struct vm_map *map, struct uvm_addr_state *uaddr_p, + struct vm_map_entry *entry) +{ + struct uaddr_bestfit_state *uaddr; + + uaddr = (struct uaddr_bestfit_state*)uaddr_p; + if (RB_REMOVE(uaddr_free_rbtree, &uaddr->ubf_free, entry) != entry) + panic("%s: entry was not in tree", __func__); +} + +int +uaddr_bestfit_select(struct vm_map *map, struct uvm_addr_state *uaddr_p, + struct vm_map_entry**entry_out, vaddr_t *addr_out, + vsize_t sz, vaddr_t align, vaddr_t offset, + vm_prot_t prot, vaddr_t hint) +{ + vaddr_t min, max; + struct uaddr_bestfit_state *uaddr; + struct vm_map_entry *entry; + vsize_t guardsz; + + uaddr = (struct uaddr_bestfit_state*)uaddr_p; + guardsz = ((map->flags & VM_MAP_GUARDPAGES) ? PAGE_SIZE : 0); + + /* + * Find smallest item on freelist capable of holding item. + * Deal with guardpages: search for space with one extra page. + */ + entry = uvm_addr_entrybyspace(&uaddr->ubf_free, sz + guardsz); + if (entry == NULL) + return ENOMEM; + + /* + * Walk the tree until we find an entry that fits. + */ + while (uvm_addr_fitspace(&min, &max, + VMMAP_FREE_START(entry), VMMAP_FREE_END(entry), + sz, align, offset, 0, guardsz) != 0) { + entry = RB_NEXT(uaddr_free_rbtree, &uaddr->ubf_free, entry); + if (entry == NULL) + return ENOMEM; + } + + /* + * Return the address that generates the least fragmentation. + */ + *entry_out = entry; + *addr_out = (min - VMMAP_FREE_START(entry) <= + VMMAP_FREE_END(entry) - guardsz - sz - max ? + min : max); + return 0; +} + + +/* + * A userspace allocator based on pivots. + */ + +const struct uvm_addr_functions uaddr_pivot_functions = { + .uaddr_select = &uaddr_pivot_select, + .uaddr_free_insert = &uaddr_pivot_insert, + .uaddr_free_remove = &uaddr_pivot_remove, + .uaddr_destroy = &uaddr_pivot_destroy, +#if defined(DEBUG) || defined(DDB) + .uaddr_print = &uaddr_pivot_print, +#endif /* DEBUG || DDB */ + .uaddr_name = "uaddr_pivot" +}; + +/* + * A special random function for pivots. + * + * This function will return: + * - a random number + * - a multiple of PAGE_SIZE + * - at least PAGE_SIZE + * + * The random function has a slightly higher change to return a small number. + */ +vsize_t +uaddr_pivot_random() +{ + int r; + + /* + * The sum of two six-sided dice will have a normal distribution. 
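+	 * (Strictly speaking the sum of two dice is triangular rather than
+	 * normal, but all that matters here is that values near the middle
+	 * are the most likely.)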
+ * We map the highest probable number to 1, by folding the curve + * (think of a graph on a piece of paper, that you fold). + * + * Because the fold happens at PIVOT_RND - 1, the numbers 0 and 1 + * have the same and highest probability of happening. + */ + r = arc4random_uniform(PIVOT_RND) + arc4random_uniform(PIVOT_RND) - + (PIVOT_RND - 1); + if (r < 0) + r = -r; + + /* + * Make the returned value at least PAGE_SIZE and a multiple of + * PAGE_SIZE. + */ + return (vaddr_t)(1 + r) << PAGE_SHIFT; +} + +/* + * Select a new pivot. + * + * A pivot must: + * - be chosen random + * - have a randomly chosen gap before it, where the uaddr_state starts + * - have a randomly chosen gap after it, before the uaddr_state ends + * + * Furthermore, the pivot must provide sufficient space for the allocation. + * The addr will be set to the selected address. + * + * Returns ENOMEM on failure. + */ +int +uaddr_pivot_newpivot(struct vm_map *map, struct uaddr_pivot_state *uaddr, + struct uaddr_pivot *pivot, + struct vm_map_entry**entry_out, vaddr_t *addr_out, + vsize_t sz, vaddr_t align, vaddr_t offset, + vsize_t before_gap, vsize_t after_gap) +{ + struct vm_map_entry *entry, *found; + vaddr_t minaddr, maxaddr; + vsize_t dist; + vaddr_t found_minaddr, found_maxaddr; + vaddr_t min, max; + vsize_t arc4_arg; + int fit_error; + u_int32_t path; + + minaddr = uaddr->up_uaddr.uaddr_minaddr; + maxaddr = uaddr->up_uaddr.uaddr_maxaddr; + KASSERT(minaddr < maxaddr); +#ifdef DIAGNOSTIC + if (minaddr + 2 * PAGE_SIZE > maxaddr) { + panic("uaddr_pivot_newpivot: cannot grant random pivot " + "in area less than 2 pages (size = 0x%lx)", + maxaddr - minaddr); + } +#endif /* DIAGNOSTIC */ + + /* + * Gap calculation: 1/32 of the size of the managed area. + * + * At most: sufficient to not get truncated at arc4random. + * At least: 2 PAGE_SIZE + * + * minaddr and maxaddr will be changed according to arc4random. + */ + dist = MAX((maxaddr - minaddr) / 32, 2 * (vaddr_t)PAGE_SIZE); + if (dist >> PAGE_SHIFT > 0xffffffff) { + minaddr += (vsize_t)arc4random() << PAGE_SHIFT; + maxaddr -= (vsize_t)arc4random() << PAGE_SHIFT; + } else { + minaddr += (vsize_t)arc4random_uniform(dist >> PAGE_SHIFT) << + PAGE_SHIFT; + maxaddr -= (vsize_t)arc4random_uniform(dist >> PAGE_SHIFT) << + PAGE_SHIFT; + } + + /* + * A very fast way to find an entry that will be large enough + * to hold the allocation, but still is found more or less + * randomly: the tree path selector has a 50% chance to go for + * a bigger or smaller entry. + * + * Note that the memory may actually be available, + * but the fragmentation may be so bad and the gaps chosen + * so unfortunately, that the allocation will not succeed. + * Or the alignment can only be satisfied by an entry that + * is not visited in the randomly selected path. + * + * This code finds an entry with sufficient space in O(log n) time. + */ + path = arc4random(); + found = NULL; + entry = RB_ROOT(&uaddr->up_free); + while (entry != NULL) { + fit_error = uvm_addr_fitspace(&min, &max, + MAX(VMMAP_FREE_START(entry), minaddr), + MIN(VMMAP_FREE_END(entry), maxaddr), + sz, align, offset, before_gap, after_gap); + + /* It fits, save this entry. */ + if (fit_error == 0) { + found = entry; + found_minaddr = min; + found_maxaddr = max; + } + + /* Next. 
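A failed fit sends the walk right, towards larger free space; on a fit we follow a random bit of 'path' instead.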
*/ + if (fit_error != 0) + entry = RB_RIGHT(entry, dfree.rbtree); + else if ((path & 0x1) == 0) { + path >>= 1; + entry = RB_RIGHT(entry, dfree.rbtree); + } else { + path >>= 1; + entry = RB_LEFT(entry, dfree.rbtree); + } + } + if (found == NULL) + return ENOMEM; /* Not found a large enough region. */ + + /* + * Calculate a random address within found. + * + * found_minaddr and found_maxaddr are already aligned, so be sure + * to select a multiple of align as the offset in the entry. + * Preferably, arc4random_uniform is used to provide no bias within + * the entry. + * However if the size of the entry exceeds arc4random_uniforms + * argument limit, we simply use arc4random (thus limiting ourselves + * to 4G * PAGE_SIZE bytes offset). + */ + if (found_maxaddr == found_minaddr) + *addr_out = found_minaddr; + else { + KASSERT(align >= PAGE_SIZE && (align & (align - 1)) == 0); + arc4_arg = found_maxaddr - found_minaddr; + if (arc4_arg > 0xffffffff) { + *addr_out = found_minaddr + + (arc4random() & (align - 1)); + } else { + *addr_out = found_minaddr + + (arc4random_uniform(arc4_arg) & (align - 1)); + } + } + /* Address was found in this entry. */ + *entry_out = found; + + /* + * Set up new pivot and return selected address. + * + * Depending on the direction of the pivot, the pivot must be placed + * at the bottom or the top of the allocation: + * - if the pivot moves upwards, place the pivot at the top of the + * allocation, + * - if the pivot moves downwards, place the pivot at the bottom + * of the allocation. + */ + pivot->entry = found; + pivot->dir = (arc4random() & 0x1 ? 1 : -1); + if (pivot->dir > 0) + pivot->addr = *addr_out + sz; + else + pivot->addr = *addr_out; + pivot->expire = PIVOT_EXPIRE - 1; /* First use is right now. */ + return 0; +} + +/* + * Pivot selector. + * + * Each time the selector is invoked, it will select a random pivot, which + * it will use to select memory with. The memory will be placed at the pivot, + * with a randomly sized gap between the allocation and the pivot. + * The pivot will then move so it will never revisit this address. + * + * Each allocation, the pivot expiry timer ticks. Once the pivot becomes + * expired, it will be replaced with a newly created pivot. Pivots also + * automatically expire if they fail to provide memory for an allocation. + * + * Expired pivots are replaced using the uaddr_pivot_newpivot() function, + * which will ensure the pivot points at memory in such a way that the + * allocation will succeed. + * As an added bonus, the uaddr_pivot_newpivot() function will perform the + * allocation immediately and move the pivot as appropriate. + * + * If uaddr_pivot_newpivot() fails to find a new pivot that will allow the + * allocation to succeed, it will not create a new pivot and the allocation + * will fail. + * + * A pivot running into used memory will automatically expire (because it will + * fail to allocate). + * + * Characteristics of the allocator: + * - best case, an allocation is O(log N) + * (it would be O(1), if it werent for the need to check if the memory is + * free; although that can be avoided...) + * - worst case, an allocation is O(log N) + * (the uaddr_pivot_newpivot() function has that complexity) + * - failed allocations always take O(log N) + * (the uaddr_pivot_newpivot() function will walk that deep into the tree). 
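+ * With the current tuning (NUM_PIVOTS = 16, PIVOT_EXPIRE = 1024,
+ * PIVOT_RND = 8) successive allocations are spread over up to 16
+ * independent pivots, each pivot is re-randomised after at most 1024
+ * allocations, and every allocation gets a random gap of 1-8 pages on
+ * either side.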
+ */ +int +uaddr_pivot_select(struct vm_map *map, struct uvm_addr_state *uaddr_p, + struct vm_map_entry**entry_out, vaddr_t *addr_out, + vsize_t sz, vaddr_t align, vaddr_t offset, + vm_prot_t prot, vaddr_t hint) +{ + struct uaddr_pivot_state *uaddr; + struct vm_map_entry *entry; + struct uaddr_pivot *pivot; + vaddr_t min, max; + vsize_t before_gap, after_gap; + int err; + + /* Hint must be handled by dedicated hint allocator. */ + if (hint != 0) + return EINVAL; + + /* + * Select a random pivot and a random gap sizes around the allocation. + */ + uaddr = (struct uaddr_pivot_state*)uaddr_p; + pivot = &uaddr->up_pivots[ + arc4random_uniform(nitems(uaddr->up_pivots))]; + before_gap = uaddr_pivot_random(); + after_gap = uaddr_pivot_random(); + if (pivot->addr == 0 || pivot->entry == NULL || pivot->expire == 0) + goto expired; /* Pivot is invalid (null or expired). */ + + /* + * Attempt to use the pivot to map the entry. + */ + entry = pivot->entry; + if (pivot->dir > 0) { + if (uvm_addr_fitspace(&min, &max, + MAX(VMMAP_FREE_START(entry), pivot->addr), + VMMAP_FREE_END(entry), sz, align, offset, + before_gap, after_gap) == 0) { + *addr_out = min; + *entry_out = entry; + pivot->addr = min + sz; + pivot->expire--; + return 0; + } + } else { + if (uvm_addr_fitspace(&min, &max, + VMMAP_FREE_START(entry), + MIN(VMMAP_FREE_END(entry), pivot->addr), + sz, align, offset, before_gap, after_gap) == 0) { + *addr_out = max; + *entry_out = entry; + pivot->addr = max; + pivot->expire--; + return 0; + } + } + +expired: + /* + * Pivot expired or allocation failed. + * Use pivot selector to do the allocation and find a new pivot. + */ + err = uaddr_pivot_newpivot(map, uaddr, pivot, entry_out, addr_out, + sz, align, offset, before_gap, after_gap); + return err; +} + +/* + * Free the pivot. + */ +void +uaddr_pivot_destroy(struct uvm_addr_state *uaddr) +{ + pool_put(&uaddr_pivot_pool, uaddr); +} + +/* + * Insert an entry with free space in the space tree. + */ +void +uaddr_pivot_insert(struct vm_map *map, struct uvm_addr_state *uaddr_p, + struct vm_map_entry *entry) +{ + struct uaddr_pivot_state *uaddr; + struct vm_map_entry *rb_rv; + struct uaddr_pivot *p; + vaddr_t check_addr; + vaddr_t start, end; + + uaddr = (struct uaddr_pivot_state*)uaddr_p; + if ((rb_rv = RB_INSERT(uaddr_free_rbtree, &uaddr->up_free, entry)) != + NULL) { + panic("%s: duplicate insertion: state %p " + "inserting entry %p which collides with %p", __func__, + uaddr, entry, rb_rv); + } + + start = VMMAP_FREE_START(entry); + end = VMMAP_FREE_END(entry); + + /* + * Update all pivots that are contained in this entry. + */ + for (p = &uaddr->up_pivots[0]; + p != &uaddr->up_pivots[nitems(uaddr->up_pivots)]; p++) { + check_addr = p->addr; + if (check_addr == 0) + continue; + if (p->dir < 0) + check_addr--; + + if (start <= check_addr && + check_addr < end) { + KASSERT(p->entry == NULL); + p->entry = entry; + } + } +} + +/* + * Remove an entry with free space from the space tree. + */ +void +uaddr_pivot_remove(struct vm_map *map, struct uvm_addr_state *uaddr_p, + struct vm_map_entry *entry) +{ + struct uaddr_pivot_state *uaddr; + struct uaddr_pivot *p; + + uaddr = (struct uaddr_pivot_state*)uaddr_p; + if (RB_REMOVE(uaddr_free_rbtree, &uaddr->up_free, entry) != entry) + panic("%s: entry was not in tree", __func__); + + /* + * Inform any pivot with this entry that the entry is gone. + * Note that this does not automatically invalidate the pivot. 
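+	 * The pivot keeps its address: if a later free-space insert covers
+	 * that address again, uaddr_pivot_insert() re-attaches the entry;
+	 * until then the NULL entry makes the next selection take the
+	 * 'expired' path and pick a new pivot.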
+ */ + for (p = &uaddr->up_pivots[0]; + p != &uaddr->up_pivots[nitems(uaddr->up_pivots)]; p++) { + if (p->entry == entry) + p->entry = NULL; + } +} + +/* + * Create a new pivot selector. + * + * Initially, all pivots are in the expired state. + * Two reasons for this: + * - it means this allocator will not take a huge amount of time + * - pivots select better on demand, because the pivot selection will be + * affected by preceding allocations: + * the next pivots will likely end up in different segments of free memory, + * that was segmented by an earlier allocation; better spread. + */ +struct uvm_addr_state* +uaddr_pivot_create(vaddr_t minaddr, vaddr_t maxaddr) +{ + struct uaddr_pivot_state *uaddr; + + uaddr = pool_get(&uaddr_pivot_pool, PR_WAITOK); + uaddr->up_uaddr.uaddr_minaddr = minaddr; + uaddr->up_uaddr.uaddr_maxaddr = maxaddr; + uaddr->up_uaddr.uaddr_functions = &uaddr_pivot_functions; + RB_INIT(&uaddr->up_free); + bzero(uaddr->up_pivots, sizeof(uaddr->up_pivots)); + + return &uaddr->up_uaddr; +} + +#if defined(DEBUG) || defined(DDB) +/* + * Print the uaddr_pivot_state. + * + * If full, a listing of all entries in the state will be provided. + */ +void +uaddr_pivot_print(struct uvm_addr_state *uaddr_p, boolean_t full, + int (*pr)(const char*, ...)) +{ + struct uaddr_pivot_state *uaddr; + struct uaddr_pivot *pivot; + struct vm_map_entry *entry; + int i; + vaddr_t check_addr; + + uaddr = (struct uaddr_pivot_state*)uaddr_p; + + for (i = 0; i < NUM_PIVOTS; i++) { + pivot = &uaddr->up_pivots[i]; + + (*pr)("\tpivot 0x%lx, epires in %d, direction %d\n", + pivot->addr, pivot->expire, pivot->dir); + } + if (!full) + return; + + if (RB_EMPTY(&uaddr->up_free)) + (*pr)("\tempty\n"); + /* Print list of free space. */ + RB_FOREACH(entry, uaddr_free_rbtree, &uaddr->up_free) { + (*pr)("\t0x%lx - 0x%lx free (0x%lx bytes)\n", + VMMAP_FREE_START(entry), VMMAP_FREE_END(entry), + VMMAP_FREE_END(entry) - VMMAP_FREE_START(entry)); + + for (i = 0; i < NUM_PIVOTS; i++) { + pivot = &uaddr->up_pivots[i]; + check_addr = pivot->addr; + if (check_addr == 0) + continue; + if (pivot->dir < 0) + check_addr--; + + if (VMMAP_FREE_START(entry) <= check_addr && + check_addr < VMMAP_FREE_END(entry)) { + (*pr)("\t\tcontains pivot %d (0x%lx)\n", + i, pivot->addr); + } + } + } +} +#endif /* DEBUG || DDB */ + +/* + * Strategy for uaddr_stack_brk_select. + */ +struct uaddr_bs_strat { + vaddr_t start; /* Start of area. */ + vaddr_t end; /* End of area. */ + int dir; /* Search direction. */ +}; + +/* + * Stack/break allocator. + * + * Stack area is grown into in the opposite direction of the stack growth, + * brk area is grown downward (because sbrk() grows upward). + * + * Both areas are grown into proportially: a weighted chance is used to + * select which one (stack or brk area) to try. If the allocation fails, + * the other one is tested. + */ + +const struct uvm_addr_functions uaddr_stack_brk_functions = { + .uaddr_select = &uaddr_stack_brk_select, + .uaddr_destroy = &uaddr_destroy, + .uaddr_name = "uaddr_stckbrk" +}; + +/* + * Stack/brk address selector. + */ +int +uaddr_stack_brk_select(struct vm_map *map, struct uvm_addr_state *uaddr, + struct vm_map_entry**entry_out, vaddr_t *addr_out, + vsize_t sz, vaddr_t align, vaddr_t offset, + vm_prot_t prot, vaddr_t hint) +{ + vsize_t before_gap, after_gap; + int stack_idx, brk_idx; + struct uaddr_bs_strat strat[2], *s; + vsize_t sb_size; + + /* + * Choose gap size and if the stack is searched before or after the + * brk area. 
+ */ + before_gap = ((arc4random() & 0x3) + 1) << PAGE_SHIFT; + after_gap = ((arc4random() & 0x3) + 1) << PAGE_SHIFT; + + sb_size = (map->s_end - map->s_start) + (map->b_end - map->b_start); + sb_size >>= PAGE_SHIFT; + if (arc4random_uniform(MAX(sb_size, 0xffffffff)) > + map->b_end - map->b_start) { + brk_idx = 1; + stack_idx = 0; + } else { + brk_idx = 0; + stack_idx = 1; + } + + /* + * Set up stack search strategy. + */ + s = &strat[stack_idx]; + s->start = MAX(map->s_start, uaddr->uaddr_minaddr); + s->end = MIN(map->s_end, uaddr->uaddr_maxaddr); +#ifdef MACHINE_STACK_GROWS_UP + s->dir = -1; +#else + s->dir = 1; +#endif + + /* + * Set up brk search strategy. + */ + s = &strat[brk_idx]; + s->start = MAX(map->b_start, uaddr->uaddr_minaddr); + s->end = MIN(map->b_end, uaddr->uaddr_maxaddr); + s->dir = -1; /* Opposite of brk() growth. */ + + /* + * Linear search for space. + */ + for (s = &strat[0]; s < &strat[nitems(strat)]; s++) { + if (s->end - s->start < sz) + continue; + if (uvm_addr_linsearch(map, uaddr, entry_out, addr_out, + 0, sz, align, offset, s->dir, s->start, s->end - sz, + before_gap, after_gap) == 0) + return 0; + } + + return ENOMEM; +} + +struct uvm_addr_state* +uaddr_stack_brk_create(vaddr_t minaddr, vaddr_t maxaddr) +{ + struct uvm_addr_state* uaddr; + + uaddr = pool_get(&uaddr_pool, PR_WAITOK); + uaddr->uaddr_minaddr = minaddr; + uaddr->uaddr_maxaddr = maxaddr; + uaddr->uaddr_functions = &uaddr_stack_brk_functions; + return uaddr; +} + + +RB_GENERATE(uaddr_free_rbtree, vm_map_entry, dfree.rbtree, + uvm_mapent_fspace_cmp); diff --git a/sys/uvm/uvm_addr.h b/sys/uvm/uvm_addr.h new file mode 100644 index 00000000000..5d94947d5a3 --- /dev/null +++ b/sys/uvm/uvm_addr.h @@ -0,0 +1,116 @@ +/* $OpenBSD: uvm_addr.h,v 1.1 2012/03/09 13:01:29 ariane Exp $ */ + +/* + * Copyright (c) 2011 Ariane van der Steldt + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef _UVM_UVM_ADDR_H_ +#define _UVM_UVM_ADDR_H_ + +#include + +/* + * Address selection logic. + * + * Address selection is just that: selection. These functions may make no + * changes to the map, except for their own state (which is passed as a + * uaddr_state pointer). + */ + + +/* + * UVM address selection base state. + * + * Each uvm address algorithm requires these parameters: + * - lower bound address (page aligned) + * - upper bound address (page aligned) + * - function address pointers + */ +struct uvm_addr_state { + vaddr_t uaddr_minaddr; + vaddr_t uaddr_maxaddr; + const struct uvm_addr_functions *uaddr_functions; +}; + +/* + * This structure describes one algorithm implementation. 
+ * + * Each algorithm is described in terms of: + * - uaddr_select: an address selection algorithm + * - uaddr_free_insert: a freelist insertion function (optional) + * - uaddr_free_remove: a freelist deletion function (optional) + * - uaddr_destroy: a destructor for the algorithm state + */ +struct uvm_addr_functions { + int (*uaddr_select)(struct vm_map *map, + struct uvm_addr_state *uaddr, + struct vm_map_entry**entry_out, vaddr_t *addr_out, + vsize_t sz, vaddr_t align, vaddr_t offset, + vm_prot_t prot, vaddr_t hint); + void (*uaddr_free_insert)(struct vm_map *map, + struct uvm_addr_state *uaddr_state, + struct vm_map_entry *entry); + void (*uaddr_free_remove)(struct vm_map *map, + struct uvm_addr_state *uaddr_state, + struct vm_map_entry *entry); + void (*uaddr_destroy)(struct uvm_addr_state *uaddr_state); + void (*uaddr_print)(struct uvm_addr_state *uaddr_state, boolean_t full, + int (*pr)(const char*, ...)); + + const char* uaddr_name; /* Name of the allocator. */ +}; + + +#ifdef _KERNEL + +void uvm_addr_init(void); +void uvm_addr_destroy(struct uvm_addr_state*); +vaddr_t uvm_addr_align(vaddr_t, vaddr_t, vaddr_t); +vaddr_t uvm_addr_align_back(vaddr_t, vaddr_t, vaddr_t); +int uvm_addr_linsearch(struct vm_map*, + struct uvm_addr_state*, struct vm_map_entry**, + vaddr_t *addr_out, vaddr_t, vsize_t, + vaddr_t, vaddr_t, int, vaddr_t, vaddr_t, + vsize_t, vsize_t); +int uvm_addr_invoke(struct vm_map*, + struct uvm_addr_state*, struct vm_map_entry**, + struct vm_map_entry**, vaddr_t*, + vsize_t, vaddr_t, vaddr_t, vm_prot_t, vaddr_t); +struct uvm_addr_state *uaddr_lin_create(vaddr_t, vaddr_t); +struct uvm_addr_state *uaddr_rnd_create(vaddr_t, vaddr_t); +struct uvm_addr_state *uaddr_hint_create(vaddr_t, vaddr_t, vsize_t); +struct uvm_addr_state *uaddr_bestfit_create(vaddr_t, vaddr_t); +struct uvm_addr_state *uaddr_pivot_create(vaddr_t, vaddr_t); +struct uvm_addr_state *uaddr_stack_brk_create(vaddr_t, vaddr_t); +int uvm_addr_fitspace(vaddr_t*, vaddr_t*, + vaddr_t, vaddr_t, vsize_t, vaddr_t, vaddr_t, + vsize_t, vsize_t); + +#if defined(DEBUG) || defined(DDB) +void uvm_addr_print(struct uvm_addr_state*, const char*, + boolean_t, int (*pr)(const char*, ...)); +#endif /* DEBUG || DDB */ + +/* + * Kernel bootstrap allocator. 
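+ * uaddr_kbootstrap hands out kernel addresses while the kernel map is being
+ * bootstrapped. It keeps no private state and walks the map directly, so
+ * the single instance can be shared by all kernel maps. The
+ * uaddr_free_rbtree below is the size-ordered free-space tree used by the
+ * bestfit and pivot selectors.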
+ */ +RB_HEAD(uaddr_free_rbtree, vm_map_entry); +RB_PROTOTYPE(uaddr_free_rbtree, vm_map_entry, dfree.rbtree, + uvm_mapent_fspace_cmp); + +extern struct uvm_addr_state uaddr_kbootstrap; + +#endif /* _KERNEL */ +#endif /* _UVM_UVM_ADDR_H_ */ diff --git a/sys/uvm/uvm_extern.h b/sys/uvm/uvm_extern.h index 201abdb923a..991a44d4776 100644 --- a/sys/uvm/uvm_extern.h +++ b/sys/uvm/uvm_extern.h @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_extern.h,v 1.103 2011/07/08 00:10:59 tedu Exp $ */ +/* $OpenBSD: uvm_extern.h,v 1.104 2012/03/09 13:01:29 ariane Exp $ */ /* $NetBSD: uvm_extern.h,v 1.57 2001/03/09 01:02:12 chs Exp $ */ /* @@ -185,6 +185,7 @@ typedef int vm_prot_t; #define UVM_FLAG_AMAPPAD 0x100000 /* for bss: pad amap to reduce malloc() */ #define UVM_FLAG_TRYLOCK 0x200000 /* fail if we can not lock map */ #define UVM_FLAG_HOLE 0x400000 /* no backend */ +#define UVM_FLAG_QUERY 0x800000 /* do everything, except actual execution */ /* macros to extract info */ #define UVM_PROTECTION(X) ((X) & UVM_PROT_MASK) @@ -644,10 +645,9 @@ void km_free(void *, size_t, const struct kmem_va_mode *, const struct kmem_pa_mode *); /* uvm_map.c */ -#define uvm_map(_m, _a, _sz, _u, _f, _al, _fl) uvm_map_p(_m, _a, _sz, _u, _f, _al, _fl, 0) -int uvm_map_p(vm_map_t, vaddr_t *, vsize_t, +int uvm_map(vm_map_t, vaddr_t *, vsize_t, struct uvm_object *, voff_t, vsize_t, - uvm_flag_t, struct proc *); + uvm_flag_t); int uvm_map_pageable(vm_map_t, vaddr_t, vaddr_t, boolean_t, int); int uvm_map_pageable_all(vm_map_t, int, vsize_t); diff --git a/sys/uvm/uvm_fault.c b/sys/uvm/uvm_fault.c index b699bba34c5..03a4418dac6 100644 --- a/sys/uvm/uvm_fault.c +++ b/sys/uvm/uvm_fault.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_fault.c,v 1.62 2011/07/03 18:34:14 oga Exp $ */ +/* $OpenBSD: uvm_fault.c,v 1.63 2012/03/09 13:01:29 ariane Exp $ */ /* $NetBSD: uvm_fault.c,v 1.51 2000/08/06 00:22:53 thorpej Exp $ */ /* @@ -1701,7 +1701,7 @@ uvm_fault_unwire(vm_map_t map, vaddr_t start, vaddr_t end) void uvm_fault_unwire_locked(vm_map_t map, vaddr_t start, vaddr_t end) { - vm_map_entry_t entry; + vm_map_entry_t entry, next; pmap_t pmap = vm_map_pmap(map); vaddr_t va; paddr_t pa; @@ -1734,9 +1734,9 @@ uvm_fault_unwire_locked(vm_map_t map, vaddr_t start, vaddr_t end) */ KASSERT(va >= entry->start); while (va >= entry->end) { - KASSERT(entry->next != &map->header && - entry->next->start <= entry->end); - entry = entry->next; + next = RB_NEXT(uvm_map_addr, &map->addr, entry); + KASSERT(next != NULL && next->start <= entry->end); + entry = next; } /* @@ -1825,6 +1825,9 @@ uvmfault_lookup(struct uvm_faultinfo *ufi, boolean_t write_lock) */ while (1) { + if (ufi->orig_rvaddr < ufi->map->min_offset || + ufi->orig_rvaddr >= ufi->map->max_offset) + return(FALSE); /* * lock map @@ -1839,7 +1842,7 @@ uvmfault_lookup(struct uvm_faultinfo *ufi, boolean_t write_lock) * lookup */ if (!uvm_map_lookup_entry(ufi->map, ufi->orig_rvaddr, - &ufi->entry)) { + &ufi->entry)) { uvmfault_unlockmaps(ufi, write_lock); return(FALSE); } diff --git a/sys/uvm/uvm_init.c b/sys/uvm/uvm_init.c index fce559d83e5..81110d054e8 100644 --- a/sys/uvm/uvm_init.c +++ b/sys/uvm/uvm_init.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_init.c,v 1.28 2010/08/07 03:50:02 krw Exp $ */ +/* $OpenBSD: uvm_init.c,v 1.29 2012/03/09 13:01:29 ariane Exp $ */ /* $NetBSD: uvm_init.c,v 1.14 2000/06/27 17:29:23 mrg Exp $ */ /* @@ -51,6 +51,7 @@ #include #include +#include /* * struct uvm: we store all global vars in this structure to make them @@ -177,4 +178,15 @@ uvm_init(void) * init anonymous memory systems */ uvm_anon_init(); + + /* + 
* Switch kernel and kmem_map over to a best-fit allocator, + * instead of walking the tree. + */ + uvm_map_set_uaddr(kernel_map, &kernel_map->uaddr_any[3], + uaddr_bestfit_create(vm_map_min(kernel_map), + vm_map_max(kernel_map))); + uvm_map_set_uaddr(kmem_map, &kmem_map->uaddr_any[3], + uaddr_bestfit_create(vm_map_min(kmem_map), + vm_map_max(kmem_map))); } diff --git a/sys/uvm/uvm_io.c b/sys/uvm/uvm_io.c index 876b5420b6f..bfeea500ace 100644 --- a/sys/uvm/uvm_io.c +++ b/sys/uvm/uvm_io.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_io.c,v 1.19 2011/06/06 17:10:23 ariane Exp $ */ +/* $OpenBSD: uvm_io.c,v 1.20 2012/03/09 13:01:29 ariane Exp $ */ /* $NetBSD: uvm_io.c,v 1.12 2000/06/27 17:29:23 mrg Exp $ */ /* @@ -64,7 +64,7 @@ uvm_io(vm_map_t map, struct uio *uio, int flags) { vaddr_t baseva, endva, pageoffset, kva; vsize_t chunksz, togo, sz; - vm_map_entry_t dead_entries; + struct uvm_map_deadq dead_entries; int error, extractflags; /* @@ -93,7 +93,7 @@ uvm_io(vm_map_t map, struct uio *uio, int flags) chunksz = min(round_page(togo + pageoffset), MAXBSIZE); error = 0; - extractflags = UVM_EXTRACT_QREF | UVM_EXTRACT_CONTIG; + extractflags = 0; if (flags & UVM_IO_FIXPROT) extractflags |= UVM_EXTRACT_FIXPROT; @@ -107,7 +107,7 @@ uvm_io(vm_map_t map, struct uio *uio, int flags) * step 2: extract mappings from the map into kernel_map */ - error = uvm_map_extract(map, baseva, chunksz, kernel_map, &kva, + error = uvm_map_extract(map, baseva, chunksz, &kva, extractflags); if (error) { @@ -139,12 +139,11 @@ uvm_io(vm_map_t map, struct uio *uio, int flags) */ vm_map_lock(kernel_map); + TAILQ_INIT(&dead_entries); uvm_unmap_remove(kernel_map, kva, kva+chunksz, - &dead_entries, NULL, FALSE); + &dead_entries, FALSE, TRUE); vm_map_unlock(kernel_map); - - if (dead_entries != NULL) - uvm_unmap_detach(dead_entries, AMAP_REFALL); + uvm_unmap_detach(&dead_entries, AMAP_REFALL); /* * We defer checking the error return from uiomove until diff --git a/sys/uvm/uvm_km.c b/sys/uvm/uvm_km.c index da5686d0881..aa97110d6bf 100644 --- a/sys/uvm/uvm_km.c +++ b/sys/uvm/uvm_km.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_km.c,v 1.106 2011/07/03 18:34:14 oga Exp $ */ +/* $OpenBSD: uvm_km.c,v 1.107 2012/03/09 13:01:29 ariane Exp $ */ /* $NetBSD: uvm_km.c,v 1.42 2001/01/14 02:10:01 thorpej Exp $ */ /* @@ -138,7 +138,6 @@ #include #include #include - #include /* @@ -184,7 +183,13 @@ uvm_km_init(vaddr_t start, vaddr_t end) * before installing. 
*/ - uvm_map_setup(&kernel_map_store, base, end, VM_MAP_PAGEABLE); + uvm_map_setup(&kernel_map_store, base, end, +#ifdef KVA_GUARDPAGES + VM_MAP_PAGEABLE | VM_MAP_GUARDPAGES +#else + VM_MAP_PAGEABLE +#endif + ); kernel_map_store.pmap = pmap_kernel(); if (base != start && uvm_map(&kernel_map_store, &base, start - base, NULL, UVM_UNKNOWN_OFFSET, 0, UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, @@ -464,16 +469,16 @@ uvm_km_free(struct vm_map *map, vaddr_t addr, vsize_t size) void uvm_km_free_wakeup(struct vm_map *map, vaddr_t addr, vsize_t size) { - struct vm_map_entry *dead_entries; + struct uvm_map_deadq dead_entries; vm_map_lock(map); + TAILQ_INIT(&dead_entries); uvm_unmap_remove(map, trunc_page(addr), round_page(addr+size), - &dead_entries, NULL, FALSE); + &dead_entries, FALSE, TRUE); wakeup(map); vm_map_unlock(map); - if (dead_entries != NULL) - uvm_unmap_detach(dead_entries, 0); + uvm_unmap_detach(&dead_entries, 0); } /* @@ -692,8 +697,10 @@ struct uvm_km_free_page *uvm_km_doputpage(struct uvm_km_free_page *); void uvm_km_page_init(void) { - int lowat_min; - int i; + int lowat_min; + int i; + int len, bulk; + vaddr_t addr; mtx_init(&uvm_km_pages.mtx, IPL_VM); if (!uvm_km_pages.lowat) { @@ -709,14 +716,27 @@ uvm_km_page_init(void) if (uvm_km_pages.hiwat > UVM_KM_PAGES_HIWAT_MAX) uvm_km_pages.hiwat = UVM_KM_PAGES_HIWAT_MAX; - for (i = 0; i < uvm_km_pages.hiwat; i++) { - uvm_km_pages.page[i] = (vaddr_t)uvm_km_kmemalloc(kernel_map, - NULL, PAGE_SIZE, UVM_KMF_NOWAIT|UVM_KMF_VALLOC); - if (uvm_km_pages.page[i] == 0) - break; + /* Allocate all pages in as few allocations as possible. */ + len = 0; + bulk = uvm_km_pages.hiwat; + while (len < uvm_km_pages.hiwat && bulk > 0) { + bulk = MIN(bulk, uvm_km_pages.hiwat - len); + addr = vm_map_min(kernel_map); + if (uvm_map(kernel_map, &addr, (vsize_t)bulk << PAGE_SHIFT, + NULL, UVM_UNKNOWN_OFFSET, 0, + UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW, UVM_INH_NONE, + UVM_ADV_RANDOM, UVM_KMF_TRYLOCK)) != 0) { + bulk /= 2; + continue; + } + + for (i = len; i < len + bulk; i++, addr += PAGE_SIZE) + uvm_km_pages.page[i] = addr; + len += bulk; } - uvm_km_pages.free = i; - for ( ; i < UVM_KM_PAGES_HIWAT_MAX; i++) + + uvm_km_pages.free = len; + for (i = len; i < UVM_KM_PAGES_HIWAT_MAX; i++) uvm_km_pages.page[i] = 0; /* tone down if really high */ @@ -760,17 +780,25 @@ uvm_km_thread(void *arg) mtx_leave(&uvm_km_pages.mtx); if (allocmore) { + bzero(pg, sizeof(pg)); for (i = 0; i < nitems(pg); i++) { - pg[i] = (vaddr_t)uvm_km_kmemalloc(kernel_map, - NULL, PAGE_SIZE, UVM_KMF_VALLOC); + pg[i] = vm_map_min(kernel_map); + if (uvm_map(kernel_map, &pg[i], PAGE_SIZE, + NULL, UVM_UNKNOWN_OFFSET, 0, + UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW, + UVM_INH_NONE, UVM_ADV_RANDOM, + UVM_KMF_TRYLOCK)) != 0) { + pg[i] = 0; + break; + } } - + mtx_enter(&uvm_km_pages.mtx); for (i = 0; i < nitems(pg); i++) { if (uvm_km_pages.free == nitems(uvm_km_pages.page)) break; - else + else if (pg[i] != 0) uvm_km_pages.page[uvm_km_pages.free++] = pg[i]; } @@ -778,8 +806,12 @@ uvm_km_thread(void *arg) mtx_leave(&uvm_km_pages.mtx); /* Cleanup left-over pages (if any). 
*/ - for (; i < nitems(pg); i++) - uvm_km_free(kernel_map, pg[i], PAGE_SIZE); + for (; i < nitems(pg); i++) { + if (pg[i] != 0) { + uvm_unmap(kernel_map, + pg[i], pg[i] + PAGE_SIZE); + } + } } while (fp) { fp = uvm_km_doputpage(fp); @@ -808,7 +840,7 @@ uvm_km_doputpage(struct uvm_km_free_page *fp) mtx_leave(&uvm_km_pages.mtx); if (freeva) - uvm_km_free(kernel_map, va, PAGE_SIZE); + uvm_unmap(kernel_map, va, va + PAGE_SIZE); uvm_pagefree(pg); return (nextfp); diff --git a/sys/uvm/uvm_map.c b/sys/uvm/uvm_map.c index e097952a130..bc6b9df0281 100644 --- a/sys/uvm/uvm_map.c +++ b/sys/uvm/uvm_map.c @@ -1,7 +1,22 @@ -/* $OpenBSD: uvm_map.c,v 1.147 2011/11/24 18:47:34 guenther Exp $ */ +/* $OpenBSD: uvm_map.c,v 1.148 2012/03/09 13:01:29 ariane Exp $ */ /* $NetBSD: uvm_map.c,v 1.86 2000/11/27 08:40:03 chs Exp $ */ -/* +/* + * Copyright (c) 2011 Ariane van der Steldt + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * * Copyright (c) 1997 Charles D. Cranor and Washington University. * Copyright (c) 1991, 1993, The Regents of the University of California. * @@ -71,6 +86,9 @@ * uvm_map.c: uvm map operations */ +/* #define DEBUG */ +/* #define VMMAP_DEBUG */ + #include #include #include @@ -86,662 +104,822 @@ #endif #include -#undef RB_AUGMENT -#define RB_AUGMENT(x) uvm_rb_augment(x) #ifdef DDB #include #endif -static struct timeval uvm_kmapent_last_warn_time; -static struct timeval uvm_kmapent_warn_rate = { 10, 0 }; - -const char vmmapbsy[] = "vmmapbsy"; +#include + + +vsize_t uvmspace_dused(struct vm_map*, vaddr_t, vaddr_t); +int uvm_mapent_isjoinable(struct vm_map*, + struct vm_map_entry*, struct vm_map_entry*); +struct vm_map_entry *uvm_mapent_merge(struct vm_map*, struct vm_map_entry*, + struct vm_map_entry*, struct uvm_map_deadq*); +struct vm_map_entry *uvm_mapent_tryjoin(struct vm_map*, + struct vm_map_entry*, struct uvm_map_deadq*); +struct vm_map_entry *uvm_map_mkentry(struct vm_map*, struct vm_map_entry*, + struct vm_map_entry*, vaddr_t, vsize_t, int, + struct uvm_map_deadq*); +struct vm_map_entry *uvm_mapent_alloc(struct vm_map*, int); +void uvm_mapent_free(struct vm_map_entry*); +void uvm_unmap_kill_entry(struct vm_map*, + struct vm_map_entry*); +void uvm_mapent_mkfree(struct vm_map*, + struct vm_map_entry*, struct vm_map_entry**, + struct uvm_map_deadq*, boolean_t); +void uvm_map_pageable_pgon(struct vm_map*, + struct vm_map_entry*, struct vm_map_entry*, + vaddr_t, vaddr_t); +int uvm_map_pageable_wire(struct vm_map*, + struct vm_map_entry*, struct vm_map_entry*, + vaddr_t, vaddr_t, int); +void uvm_map_setup_entries(struct vm_map*); +void uvm_map_teardown(struct vm_map*); +void uvm_map_vmspace_update(struct vm_map*, + struct uvm_map_deadq*, int); +void uvm_map_kmem_grow(struct vm_map*, + struct uvm_map_deadq*, vsize_t, int); +void uvm_map_freelist_update_clear(struct vm_map*, + struct uvm_map_deadq*); +void 
uvm_map_freelist_update_refill(struct vm_map *, int); +void uvm_map_freelist_update(struct vm_map*, + struct uvm_map_deadq*, vaddr_t, vaddr_t, + vaddr_t, vaddr_t, int); +struct vm_map_entry *uvm_map_fix_space(struct vm_map*, struct vm_map_entry*, + vaddr_t, vaddr_t, int); +int uvm_map_sel_limits(vaddr_t*, vaddr_t*, vsize_t, int, + struct vm_map_entry*, vaddr_t, vaddr_t, vaddr_t, + int); +int uvm_map_findspace(struct vm_map*, + struct vm_map_entry**, struct vm_map_entry**, + vaddr_t*, vsize_t, vaddr_t, vaddr_t, vm_prot_t, + vaddr_t); /* - * pool for vmspace structures. + * Tree management functions. */ -struct pool uvm_vmspace_pool; +static __inline void uvm_mapent_copy(struct vm_map_entry*, + struct vm_map_entry*); +static int uvm_mapentry_addrcmp(struct vm_map_entry*, + struct vm_map_entry*); +static int uvm_mapentry_freecmp(struct vm_map_entry*, + struct vm_map_entry*); +void uvm_mapent_free_insert(struct vm_map*, + struct uvm_addr_state*, struct vm_map_entry*); +void uvm_mapent_free_remove(struct vm_map*, + struct uvm_addr_state*, struct vm_map_entry*); +void uvm_mapent_addr_insert(struct vm_map*, + struct vm_map_entry*); +void uvm_mapent_addr_remove(struct vm_map*, + struct vm_map_entry*); +void uvm_map_splitentry(struct vm_map*, + struct vm_map_entry*, struct vm_map_entry*, + vaddr_t); +vsize_t uvm_map_boundary(struct vm_map*, vaddr_t, vaddr_t); +int uvm_mapent_bias(struct vm_map*, struct vm_map_entry*); /* - * pool for dynamically-allocated map entries. + * uvm_vmspace_fork helper functions. */ +struct vm_map_entry *uvm_mapent_clone(struct vm_map*, vaddr_t, vsize_t, + vsize_t, struct vm_map_entry*, + struct uvm_map_deadq*, int, int); +void uvm_mapent_forkshared(struct vmspace*, struct vm_map*, + struct vm_map*, struct vm_map_entry*, + struct uvm_map_deadq*); +void uvm_mapent_forkcopy(struct vmspace*, struct vm_map*, + struct vm_map*, struct vm_map_entry*, + struct uvm_map_deadq*); -struct pool uvm_map_entry_pool; -struct pool uvm_map_entry_kmem_pool; - -#ifdef PMAP_GROWKERNEL /* - * This global represents the end of the kernel virtual address - * space. If we want to exceed this, we must grow the kernel - * virtual address space dynamically. - * - * Note, this variable is locked by kernel_map's lock. + * Tree validation. */ -vaddr_t uvm_maxkaddr; + +#ifdef VMMAP_DEBUG +void uvm_tree_assert(struct vm_map*, int, char*, + char*, int); +#define UVM_ASSERT(map, cond, file, line) \ + uvm_tree_assert((map), (cond), #cond, (file), (line)) +void uvm_tree_sanity(struct vm_map*, char*, int); +void uvm_tree_size_chk(struct vm_map*, char*, int); +void vmspace_validate(struct vm_map*); +#else +#define uvm_tree_sanity(_map, _file, _line) do {} while (0) +#define uvm_tree_size_chk(_map, _file, _line) do {} while (0) +#define vmspace_validate(_map) do {} while (0) #endif /* - * macros + * All architectures will have pmap_prefer. */ +#ifndef PMAP_PREFER +#define PMAP_PREFER_ALIGN() (vaddr_t)PAGE_SIZE +#define PMAP_PREFER_OFFSET(off) 0 +#define PMAP_PREFER(addr, off) (addr) +#endif + /* - * uvm_map_entry_link: insert entry into a map + * The kernel map will initially be VM_MAP_KSIZE_INIT bytes. + * Every time that gets cramped, we grow by at least VM_MAP_KSIZE_DELTA bytes. * - * => map must be locked + * We attempt to grow by UVM_MAP_KSIZE_ALLOCMUL times the allocation size + * each time. 
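+ *
+ * (With 4KB pages the constants below come to a 2MB initial kernel map
+ * and a minimum growth step of 1MB.)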
*/ -#define uvm_map_entry_link(map, after_where, entry) do { \ - (map)->nentries++; \ - (entry)->prev = (after_where); \ - (entry)->next = (after_where)->next; \ - (entry)->prev->next = (entry); \ - (entry)->next->prev = (entry); \ - uvm_rb_insert(map, entry); \ -} while (0) - +#define VM_MAP_KSIZE_INIT (512 * (vaddr_t)PAGE_SIZE) +#define VM_MAP_KSIZE_DELTA (256 * (vaddr_t)PAGE_SIZE) +#define VM_MAP_KSIZE_ALLOCMUL 4 /* - * uvm_map_entry_unlink: remove entry from a map - * - * => map must be locked + * When selecting a random free-space block, look at most FSPACE_DELTA blocks + * ahead. */ -#define uvm_map_entry_unlink(map, entry) do { \ - (map)->nentries--; \ - (entry)->next->prev = (entry)->prev; \ - (entry)->prev->next = (entry)->next; \ - uvm_rb_remove(map, entry); \ -} while (0) - +#define FSPACE_DELTA 8 /* - * SAVE_HINT: saves the specified entry as the hint for future lookups. + * Put allocations adjecent to previous allocations when the free-space tree + * is larger than FSPACE_COMPACT entries. * - * => map need not be locked (protected by hint_lock). + * Alignment and PMAP_PREFER may still cause the entry to not be fully + * adjecent. Note that this strategy reduces memory fragmentation (by leaving + * a large space before or after the allocation). */ -#define SAVE_HINT(map,check,value) do { \ - simple_lock(&(map)->hint_lock); \ - if ((map)->hint == (check)) \ - (map)->hint = (value); \ - simple_unlock(&(map)->hint_lock); \ -} while (0) - +#define FSPACE_COMPACT 128 /* - * VM_MAP_RANGE_CHECK: check and correct range + * Make the address selection skip at most this many bytes from the start of + * the free space in which the allocation takes place. * - * => map must at least be read locked + * The main idea behind a randomized address space is that an attacker cannot + * know where to target his attack. Therefore, the location of objects must be + * as random as possible. However, the goal is not to create the most sparse + * map that is possible. + * FSPACE_MAXOFF pushes the considered range in bytes down to less insane + * sizes, thereby reducing the sparseness. The biggest randomization comes + * from fragmentation, i.e. FSPACE_COMPACT. */ - -#define VM_MAP_RANGE_CHECK(map, start, end) do { \ - if (start < vm_map_min(map)) \ - start = vm_map_min(map); \ - if (end > vm_map_max(map)) \ - end = vm_map_max(map); \ - if (start > end) \ - start = end; \ -} while (0) - +#define FSPACE_MAXOFF ((vaddr_t)32 * 1024 * 1024) /* - * local prototypes + * Allow for small gaps in the overflow areas. + * Gap size is in bytes and does not have to be a multiple of page-size. */ +#define FSPACE_BIASGAP ((vaddr_t)32 * 1024) -void uvm_mapent_copy(struct vm_map_entry *, struct vm_map_entry *); -void uvm_map_entry_unwire(struct vm_map *, struct vm_map_entry *); -void uvm_map_reference_amap(struct vm_map_entry *, int); -void uvm_map_unreference_amap(struct vm_map_entry *, int); -int uvm_map_spacefits(struct vm_map *, vaddr_t *, vsize_t, - struct vm_map_entry *, voff_t, vsize_t); +/* auto-allocate address lower bound */ +#define VMMAP_MIN_ADDR PAGE_SIZE -struct vm_map_entry *uvm_mapent_alloc(struct vm_map *, int); -void uvm_mapent_free(struct vm_map_entry *); -#ifdef KVA_GUARDPAGES -/* - * Number of kva guardpages in use. - */ -int kva_guardpages; +#ifdef DEADBEEF0 +#define UVMMAP_DEADBEEF ((void*)DEADBEEF0) +#else +#define UVMMAP_DEADBEEF ((void*)0xdeadd0d0) #endif - -/* - * Tree manipulation. 
- */ -void uvm_rb_insert(struct vm_map *, struct vm_map_entry *); -void uvm_rb_remove(struct vm_map *, struct vm_map_entry *); -vsize_t uvm_rb_space(struct vm_map *, struct vm_map_entry *); - #ifdef DEBUG -int _uvm_tree_sanity(struct vm_map *map, const char *name); -#endif -vsize_t uvm_rb_subtree_space(struct vm_map_entry *); -void uvm_rb_fixup(struct vm_map *, struct vm_map_entry *); - -static __inline int -uvm_compare(struct vm_map_entry *a, struct vm_map_entry *b) -{ - if (a->start < b->start) - return (-1); - else if (a->start > b->start) - return (1); - - return (0); -} +int uvm_map_printlocks = 0; +#define LPRINTF(_args) \ + do { \ + if (uvm_map_printlocks) \ + printf _args; \ + } while (0) +#else +#define LPRINTF(_args) do {} while (0) +#endif -static __inline void -uvm_rb_augment(struct vm_map_entry *entry) -{ - entry->space = uvm_rb_subtree_space(entry); -} +static struct timeval uvm_kmapent_last_warn_time; +static struct timeval uvm_kmapent_warn_rate = { 10, 0 }; -RB_PROTOTYPE(uvm_tree, vm_map_entry, rb_entry, uvm_compare); +const char vmmapbsy[] = "vmmapbsy"; -RB_GENERATE(uvm_tree, vm_map_entry, rb_entry, uvm_compare); +/* + * pool for vmspace structures. + */ +struct pool uvm_vmspace_pool; -vsize_t -uvm_rb_space(struct vm_map *map, struct vm_map_entry *entry) -{ - struct vm_map_entry *next; - vaddr_t space; +/* + * pool for dynamically-allocated map entries. + */ +struct pool uvm_map_entry_pool; +struct pool uvm_map_entry_kmem_pool; - if ((next = entry->next) == &map->header) - space = map->max_offset - entry->end; - else { - KASSERT(next); - space = next->start - entry->end; - } - return (space); -} - -vsize_t -uvm_rb_subtree_space(struct vm_map_entry *entry) -{ - vaddr_t space, tmp; +/* + * This global represents the end of the kernel virtual address + * space. If we want to exceed this, we must grow the kernel + * virtual address space dynamically. + * + * Note, this variable is locked by kernel_map's lock. + */ +vaddr_t uvm_maxkaddr; - space = entry->ownspace; - if (RB_LEFT(entry, rb_entry)) { - tmp = RB_LEFT(entry, rb_entry)->space; - if (tmp > space) - space = tmp; - } +/* + * Locking predicate. + */ +#define UVM_MAP_REQ_WRITE(_map) \ + do { \ + if (((_map)->flags & VM_MAP_INTRSAFE) == 0) \ + rw_assert_wrlock(&(_map)->lock); \ + } while (0) - if (RB_RIGHT(entry, rb_entry)) { - tmp = RB_RIGHT(entry, rb_entry)->space; - if (tmp > space) - space = tmp; - } +/* + * Tree describing entries by address. + * + * Addresses are unique. + * Entries with start == end may only exist if they are the first entry + * (sorted by address) within a free-memory tree. + */ - return (space); +static __inline int +uvm_mapentry_addrcmp(struct vm_map_entry *e1, struct vm_map_entry *e2) +{ + return e1->start < e2->start ? -1 : e1->start > e2->start; } -void -uvm_rb_fixup(struct vm_map *map, struct vm_map_entry *entry) +/* + * Tree describing free memory. + * + * Free memory is indexed (so we can use array semantics in O(log N). + * Free memory is ordered by size (so we can reduce fragmentation). + * + * The address range in the tree can be limited, having part of the + * free memory not in the free-memory tree. Only free memory in the + * tree will be considered during 'any address' allocations. 
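+ *
+ * The comparator below orders entries by fspace (the size of the free
+ * range) and falls back to the address comparator, so equal-sized free
+ * ranges still have a unique position in the tree.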
+ */ + +static __inline int +uvm_mapentry_freecmp(struct vm_map_entry *e1, struct vm_map_entry *e2) { - /* We need to traverse to the very top */ - do { - entry->ownspace = uvm_rb_space(map, entry); - entry->space = uvm_rb_subtree_space(entry); - } while ((entry = RB_PARENT(entry, rb_entry)) != NULL); + int cmp = e1->fspace < e2->fspace ? -1 : e1->fspace > e2->fspace; + return cmp ? cmp : uvm_mapentry_addrcmp(e1, e2); } -void -uvm_rb_insert(struct vm_map *map, struct vm_map_entry *entry) +/* + * Copy mapentry. + */ +static __inline void +uvm_mapent_copy(struct vm_map_entry *src, struct vm_map_entry *dst) { - vaddr_t space = uvm_rb_space(map, entry); - struct vm_map_entry *tmp; + caddr_t csrc, cdst; + size_t sz; - entry->ownspace = entry->space = space; - tmp = RB_INSERT(uvm_tree, &(map)->rbhead, entry); -#ifdef DIAGNOSTIC - if (tmp != NULL) - panic("uvm_rb_insert: duplicate entry?"); -#endif - uvm_rb_fixup(map, entry); - if (entry->prev != &map->header) - uvm_rb_fixup(map, entry->prev); + csrc = (caddr_t)src; + cdst = (caddr_t)dst; + csrc += offsetof(struct vm_map_entry, uvm_map_entry_start_copy); + cdst += offsetof(struct vm_map_entry, uvm_map_entry_start_copy); + + sz = offsetof(struct vm_map_entry, uvm_map_entry_stop_copy) - + offsetof(struct vm_map_entry, uvm_map_entry_start_copy); + memcpy(cdst, csrc, sz); } +/* + * Handle free-list insertion. + */ void -uvm_rb_remove(struct vm_map *map, struct vm_map_entry *entry) +uvm_mapent_free_insert(struct vm_map *map, struct uvm_addr_state *uaddr, + struct vm_map_entry *entry) { - struct vm_map_entry *parent; - - parent = RB_PARENT(entry, rb_entry); - RB_REMOVE(uvm_tree, &(map)->rbhead, entry); - if (entry->prev != &map->header) - uvm_rb_fixup(map, entry->prev); - if (parent) - uvm_rb_fixup(map, parent); -} - -#ifdef DEBUG -#define uvm_tree_sanity(x,y) _uvm_tree_sanity(x,y) -#else -#define uvm_tree_sanity(x,y) + const struct uvm_addr_functions *fun; +#ifdef VMMAP_DEBUG + vaddr_t min, max, bound; #endif -#ifdef DEBUG -int -_uvm_tree_sanity(struct vm_map *map, const char *name) -{ - struct vm_map_entry *tmp, *trtmp; - int n = 0, i = 1; - - RB_FOREACH(tmp, uvm_tree, &map->rbhead) { - if (tmp->ownspace != uvm_rb_space(map, tmp)) { - printf("%s: %d/%d ownspace %x != %x %s\n", - name, n + 1, map->nentries, - tmp->ownspace, uvm_rb_space(map, tmp), - tmp->next == &map->header ? "(last)" : ""); - goto error; - } - } - trtmp = NULL; - RB_FOREACH(tmp, uvm_tree, &map->rbhead) { - if (tmp->space != uvm_rb_subtree_space(tmp)) { - printf("%s: space %d != %d\n", - name, tmp->space, uvm_rb_subtree_space(tmp)); - goto error; - } - if (trtmp != NULL && trtmp->start >= tmp->start) { - printf("%s: corrupt: 0x%lx >= 0x%lx\n", - name, trtmp->start, tmp->start); - goto error; - } - n++; - - trtmp = tmp; - } +#ifdef VMMAP_DEBUG + /* + * Boundary check. + * Boundaries are folded if they go on the same free list. 
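+	 *
+	 * The loop below walks every boundary inside the entry's free range
+	 * and asserts that each sub-range maps to the same uaddr.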
+ */ + min = VMMAP_FREE_START(entry); + max = VMMAP_FREE_END(entry); - if (n != map->nentries) { - printf("%s: nentries: %d vs %d\n", - name, n, map->nentries); - goto error; + while (min < max) { + bound = uvm_map_boundary(map, min, max); + KASSERT(uvm_map_uaddr(map, min) == uaddr); + min = bound; } +#endif + KDASSERT((entry->fspace & (vaddr_t)PAGE_MASK) == 0); + KASSERT((entry->etype & UVM_ET_FREEMAPPED) == 0); - for (tmp = map->header.next; tmp && tmp != &map->header; - tmp = tmp->next, i++) { - trtmp = RB_FIND(uvm_tree, &map->rbhead, tmp); - if (trtmp != tmp) { - printf("%s: lookup: %d: %p - %p: %p\n", - name, i, tmp, trtmp, - RB_PARENT(tmp, rb_entry)); - goto error; - } - } + UVM_MAP_REQ_WRITE(map); - return (0); - error: -#ifdef DDB - /* handy breakpoint location for error case */ - __asm(".globl treesanity_label\ntreesanity_label:"); -#endif - return (-1); + /* Actual insert: forward to uaddr pointer. */ + fun = uaddr->uaddr_functions; + KDASSERT(fun != NULL); + if (fun->uaddr_free_insert != NULL) + (*fun->uaddr_free_insert)(map, uaddr, entry); + entry->etype |= UVM_ET_FREEMAPPED; } -#endif /* - * uvm_mapent_alloc: allocate a map entry + * Handle free-list removal. */ - -struct vm_map_entry * -uvm_mapent_alloc(struct vm_map *map, int flags) +void +uvm_mapent_free_remove(struct vm_map *map, struct uvm_addr_state *uaddr, + struct vm_map_entry *entry) { - struct vm_map_entry *me, *ne; - int s, i; - int pool_flags; - - pool_flags = PR_WAITOK; - if (flags & UVM_FLAG_TRYLOCK) - pool_flags = PR_NOWAIT; + const struct uvm_addr_functions *fun; - if (map->flags & VM_MAP_INTRSAFE || cold) { - s = splvm(); - simple_lock(&uvm.kentry_lock); - me = uvm.kentry_free; - if (me == NULL) { - ne = km_alloc(PAGE_SIZE, &kv_page, &kp_dirty, - &kd_nowait); - if (ne == NULL) - panic("uvm_mapent_alloc: cannot allocate map " - "entry"); - for (i = 0; - i < PAGE_SIZE / sizeof(struct vm_map_entry) - 1; - i++) - ne[i].next = &ne[i + 1]; - ne[i].next = NULL; - me = ne; - if (ratecheck(&uvm_kmapent_last_warn_time, - &uvm_kmapent_warn_rate)) - printf("uvm_mapent_alloc: out of static " - "map entries\n"); - } - uvm.kentry_free = me->next; - uvmexp.kmapent++; - simple_unlock(&uvm.kentry_lock); - splx(s); - me->flags = UVM_MAP_STATIC; - } else if (map == kernel_map) { - splassert(IPL_NONE); - me = pool_get(&uvm_map_entry_kmem_pool, pool_flags); - if (me == NULL) - goto out; - me->flags = UVM_MAP_KMEM; - } else { - splassert(IPL_NONE); - me = pool_get(&uvm_map_entry_pool, pool_flags); - if (me == NULL) - goto out; - me->flags = 0; - } + KASSERT((entry->etype & UVM_ET_FREEMAPPED) != 0); + KASSERT(uvm_map_uaddr_e(map, entry) == uaddr); + UVM_MAP_REQ_WRITE(map); -out: - return(me); + fun = uaddr->uaddr_functions; + if (fun->uaddr_free_remove != NULL) + (*fun->uaddr_free_remove)(map, uaddr, entry); + entry->etype &= ~UVM_ET_FREEMAPPED; } /* - * uvm_mapent_free: free map entry - * - * => XXX: static pool for kernel map? + * Handle address tree insertion. 
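+ *
+ * The entry must not already be linked in the address tree: its rb
+ * pointers are expected to hold UVMMAP_DEADBEEF, the value that
+ * uvm_mapent_addr_remove() leaves behind.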
*/ - void -uvm_mapent_free(struct vm_map_entry *me) +uvm_mapent_addr_insert(struct vm_map *map, struct vm_map_entry *entry) { - int s; - - if (me->flags & UVM_MAP_STATIC) { - s = splvm(); - simple_lock(&uvm.kentry_lock); - me->next = uvm.kentry_free; - uvm.kentry_free = me; - uvmexp.kmapent--; - simple_unlock(&uvm.kentry_lock); - splx(s); - } else if (me->flags & UVM_MAP_KMEM) { - splassert(IPL_NONE); - pool_put(&uvm_map_entry_kmem_pool, me); - } else { - splassert(IPL_NONE); - pool_put(&uvm_map_entry_pool, me); + struct vm_map_entry *res; + + if (RB_LEFT(entry, daddrs.addr_entry) != UVMMAP_DEADBEEF || + RB_RIGHT(entry, daddrs.addr_entry) != UVMMAP_DEADBEEF || + RB_PARENT(entry, daddrs.addr_entry) != UVMMAP_DEADBEEF) + panic("uvm_mapent_addr_insert: entry still in addr list"); + KDASSERT(entry->start <= entry->end); + KDASSERT((entry->start & (vaddr_t)PAGE_MASK) == 0 && + (entry->end & (vaddr_t)PAGE_MASK) == 0); + + UVM_MAP_REQ_WRITE(map); + res = RB_INSERT(uvm_map_addr, &map->addr, entry); + if (res != NULL) { + panic("uvm_mapent_addr_insert: map %p entry %p " + "(0x%lx-0x%lx G=0x%lx F=0x%lx) insert collision " + "with entry %p (0x%lx-0x%lx G=0x%lx F=0x%lx)", + map, entry, + entry->start, entry->end, entry->guard, entry->fspace, + res, res->start, res->end, res->guard, res->fspace); } } /* - * uvm_mapent_copy: copy a map entry, preserving flags + * Handle address tree removal. */ - void -uvm_mapent_copy(struct vm_map_entry *src, struct vm_map_entry *dst) +uvm_mapent_addr_remove(struct vm_map *map, struct vm_map_entry *entry) { - memcpy(dst, src, ((char *)&src->uvm_map_entry_stop_copy) - - ((char *)src)); + struct vm_map_entry *res; + + UVM_MAP_REQ_WRITE(map); + res = RB_REMOVE(uvm_map_addr, &map->addr, entry); + if (res != entry) + panic("uvm_mapent_addr_remove"); + RB_LEFT(entry, daddrs.addr_entry) = RB_RIGHT(entry, daddrs.addr_entry) = + RB_PARENT(entry, daddrs.addr_entry) = UVMMAP_DEADBEEF; } /* - * uvm_map_entry_unwire: unwire a map entry + * uvm_map_reference: add reference to a map * - * => map should be locked by caller + * XXX check map reference counter lock */ -void -uvm_map_entry_unwire(struct vm_map *map, struct vm_map_entry *entry) +#define uvm_map_reference(_map) \ + do { \ + simple_lock(&map->ref_lock); \ + map->ref_count++; \ + simple_unlock(&map->ref_lock); \ + } while (0) + +/* + * Calculate the dused delta. + */ +vsize_t +uvmspace_dused(struct vm_map *map, vaddr_t min, vaddr_t max) { + struct vmspace *vm; + vsize_t sz; + vaddr_t lmax; + vaddr_t stack_begin, stack_end; /* Position of stack. */ + + KASSERT(map->flags & VM_MAP_ISVMSPACE); + vm = (struct vmspace *)map; + stack_begin = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr); + stack_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr); + + sz = 0; + while (min != max) { + lmax = max; + if (min < stack_begin && lmax > stack_begin) + lmax = stack_begin; + else if (min < stack_end && lmax > stack_end) + lmax = stack_end; + + if (min >= stack_begin && min < stack_end) { + /* nothing */ + } else + sz += lmax - min; + min = lmax; + } - entry->wired_count = 0; - uvm_fault_unwire_locked(map, entry->start, entry->end); + return sz >> PAGE_SHIFT; } - /* - * wrapper for calling amap_ref() + * Find the entry describing the given address. 
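+ * The address may fall in the entry itself or in the free space
+ * directly following it (up to VMMAP_FREE_END); NULL is returned
+ * if no entry contains the address.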
*/ -void -uvm_map_reference_amap(struct vm_map_entry *entry, int flags) +struct vm_map_entry* +uvm_map_entrybyaddr(struct uvm_map_addr *atree, vaddr_t addr) { - amap_ref(entry->aref.ar_amap, entry->aref.ar_pageoff, - (entry->end - entry->start) >> PAGE_SHIFT, flags); + struct vm_map_entry *iter; + + iter = RB_ROOT(atree); + while (iter != NULL) { + if (iter->start > addr) + iter = RB_LEFT(iter, daddrs.addr_entry); + else if (VMMAP_FREE_END(iter) <= addr) + iter = RB_RIGHT(iter, daddrs.addr_entry); + else + return iter; + } + return NULL; } - /* - * wrapper for calling amap_unref() + * DEAD_ENTRY_PUSH(struct vm_map_deadq *deadq, struct vm_map_entry *entry) + * + * Push dead entries into a linked list. + * Since the linked list abuses the address tree for storage, the entry + * may not be linked in a map. + * + * *head must be initialized to NULL before the first call to this macro. + * uvm_unmap_detach(*head, 0) will remove dead entries. */ -void -uvm_map_unreference_amap(struct vm_map_entry *entry, int flags) +static __inline void +dead_entry_push(struct uvm_map_deadq *deadq, struct vm_map_entry *entry) { - amap_unref(entry->aref.ar_amap, entry->aref.ar_pageoff, - (entry->end - entry->start) >> PAGE_SHIFT, flags); + TAILQ_INSERT_TAIL(deadq, entry, dfree.deadq); } - +#define DEAD_ENTRY_PUSH(_headptr, _entry) \ + dead_entry_push((_headptr), (_entry)) /* - * uvm_map_init: init mapping system at boot time. note that we allocate - * and init the static pool of structs vm_map_entry for the kernel here. + * Helper function for uvm_map_findspace_tree. + * + * Given allocation constraints and pmap constraints, finds the + * lowest and highest address in a range that can be used for the + * allocation. + * + * pmap_align and pmap_off are ignored on non-PMAP_PREFER archs. + * + * + * Big chunk of math with a seasoning of dragons. */ - -void -uvm_map_init(void) +int +uvm_map_sel_limits(vaddr_t *min, vaddr_t *max, vsize_t sz, int guardpg, + struct vm_map_entry *sel, vaddr_t align, + vaddr_t pmap_align, vaddr_t pmap_off, int bias) { - static struct vm_map_entry kernel_map_entry[MAX_KMAPENT]; - int lcv; + vaddr_t sel_min, sel_max; +#ifdef PMAP_PREFER + vaddr_t pmap_min, pmap_max; +#endif /* PMAP_PREFER */ +#ifdef DIAGNOSTIC + int bad; +#endif /* DIAGNOSTIC */ - /* - * set up static pool of kernel map entries ... - */ + sel_min = VMMAP_FREE_START(sel); + sel_max = VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0); - simple_lock_init(&uvm.kentry_lock); - uvm.kentry_free = NULL; - for (lcv = 0 ; lcv < MAX_KMAPENT ; lcv++) { - kernel_map_entry[lcv].next = uvm.kentry_free; - uvm.kentry_free = &kernel_map_entry[lcv]; - } +#ifdef PMAP_PREFER /* - * initialize the map-related pools. + * There are two special cases, in which we can satisfy the align + * requirement and the pmap_prefer requirement. + * - when pmap_off == 0, we always select the largest of the two + * - when pmap_off % align == 0 and pmap_align > align, we simply + * satisfy the pmap_align requirement and automatically + * satisfy the align requirement. 
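+	 *
+	 * (Example with hypothetical values: pmap_align 64KB, pmap_off 16KB,
+	 * align 8KB: any address that is 16KB past a 64KB boundary is also
+	 * 8KB aligned, so only the pmap_prefer constraint needs enforcing.)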
*/ - pool_init(&uvm_vmspace_pool, sizeof(struct vmspace), - 0, 0, 0, "vmsppl", &pool_allocator_nointr); - pool_init(&uvm_map_entry_pool, sizeof(struct vm_map_entry), - 0, 0, 0, "vmmpepl", &pool_allocator_nointr); - pool_init(&uvm_map_entry_kmem_pool, sizeof(struct vm_map_entry), - 0, 0, 0, "vmmpekpl", NULL); - pool_sethiwat(&uvm_map_entry_pool, 8192); -} - -/* - * clippers - */ + if (align > PAGE_SIZE && + !(pmap_align > align && (pmap_off & (align - 1)) == 0)) { + /* + * Simple case: only use align. + */ + sel_min = roundup(sel_min, align); + sel_max &= ~(align - 1); -/* - * uvm_map_clip_start: ensure that the entry begins at or after - * the starting address, if it doesn't we split the entry. - * - * => caller should use UVM_MAP_CLIP_START macro rather than calling - * this directly - * => map must be locked by caller - */ + if (sel_min > sel_max) + return ENOMEM; -void -uvm_map_clip_start(struct vm_map *map, struct vm_map_entry *entry, - vaddr_t start) -{ - struct vm_map_entry *new_entry; - vaddr_t new_adj; + /* + * Correct for bias. + */ + if (sel_max - sel_min > FSPACE_BIASGAP) { + if (bias > 0) { + sel_min = sel_max - FSPACE_BIASGAP; + sel_min = roundup(sel_min, align); + } else if (bias < 0) { + sel_max = sel_min + FSPACE_BIASGAP; + sel_max &= ~(align - 1); + } + } + } else if (pmap_align != 0) { + /* + * Special case: satisfy both pmap_prefer and + * align argument. + */ + pmap_max = sel_max & ~(pmap_align - 1); + pmap_min = sel_min; + if (pmap_max < sel_min) + return ENOMEM; + + /* Adjust pmap_min for BIASGAP for top-addr bias. */ + if (bias > 0 && pmap_max - pmap_min > FSPACE_BIASGAP) + pmap_min = pmap_max - FSPACE_BIASGAP; + /* Align pmap_min. */ + pmap_min &= ~(pmap_align - 1); + if (pmap_min < sel_min) + pmap_min += pmap_align; + if (pmap_min > pmap_max) + return ENOMEM; + + /* Adjust pmap_max for BIASGAP for bottom-addr bias. */ + if (bias < 0 && pmap_max - pmap_min > FSPACE_BIASGAP) { + pmap_max = (pmap_min + FSPACE_BIASGAP) & + ~(pmap_align - 1); + } + if (pmap_min > pmap_max) + return ENOMEM; - /* uvm_map_simplify_entry(map, entry); */ /* XXX */ + /* Apply pmap prefer offset. */ + pmap_max |= pmap_off; + if (pmap_max > sel_max) + pmap_max -= pmap_align; + pmap_min |= pmap_off; + if (pmap_min < sel_min) + pmap_min += pmap_align; - uvm_tree_sanity(map, "clip_start entry"); + /* + * Fixup: it's possible that pmap_min and pmap_max + * cross eachother. In this case, try to find one + * address that is allowed. + * (This usually happens in biased case.) + */ + if (pmap_min > pmap_max) { + if (pmap_min < sel_max) + pmap_max = pmap_min; + else if (pmap_max > sel_min) + pmap_min = pmap_max; + else + return ENOMEM; + } - /* - * Split off the front portion. note that we must insert the new - * entry BEFORE this one, so that this entry has the specified - * starting address. - */ + /* Internal validation. 
*/ + KDASSERT(pmap_min <= pmap_max); - new_entry = uvm_mapent_alloc(map, 0); - uvm_mapent_copy(entry, new_entry); /* entry -> new_entry */ + sel_min = pmap_min; + sel_max = pmap_max; + } else if (bias > 0 && sel_max - sel_min > FSPACE_BIASGAP) + sel_min = sel_max - FSPACE_BIASGAP; + else if (bias < 0 && sel_max - sel_min > FSPACE_BIASGAP) + sel_max = sel_min + FSPACE_BIASGAP; - new_entry->end = start; - new_adj = start - new_entry->start; - if (entry->object.uvm_obj) - entry->offset += new_adj; /* shift start over */ +#else - /* Does not change order for the RB tree */ - entry->start = start; + if (align > PAGE_SIZE) { + sel_min = roundup(sel_min, align); + sel_max &= ~(align - 1); + if (sel_min > sel_max) + return ENOMEM; - if (new_entry->aref.ar_amap) { - amap_splitref(&new_entry->aref, &entry->aref, new_adj); - } + if (bias != 0 && sel_max - sel_min > FSPACE_BIASGAP) { + if (bias > 0) { + sel_min = roundup(sel_max - FSPACE_BIASGAP, + align); + } else { + sel_max = (sel_min + FSPACE_BIASGAP) & + ~(align - 1); + } + } + } else if (bias > 0 && sel_max - sel_min > FSPACE_BIASGAP) + sel_min = sel_max - FSPACE_BIASGAP; + else if (bias < 0 && sel_max - sel_min > FSPACE_BIASGAP) + sel_max = sel_min + FSPACE_BIASGAP; - uvm_map_entry_link(map, entry->prev, new_entry); +#endif - if (UVM_ET_ISSUBMAP(entry)) { - /* ... unlikely to happen, but play it safe */ - uvm_map_reference(new_entry->object.sub_map); - } else { - if (UVM_ET_ISOBJ(entry) && - entry->object.uvm_obj->pgops && - entry->object.uvm_obj->pgops->pgo_reference) - entry->object.uvm_obj->pgops->pgo_reference( - entry->object.uvm_obj); + if (sel_min > sel_max) + return ENOMEM; + +#ifdef DIAGNOSTIC + bad = 0; + /* Lower boundary check. */ + if (sel_min < VMMAP_FREE_START(sel)) { + printf("sel_min: 0x%lx, but should be at least 0x%lx\n", + sel_min, VMMAP_FREE_START(sel)); + bad++; + } + /* Upper boundary check. */ + if (sel_max > VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0)) { + printf("sel_max: 0x%lx, but should be at most 0x%lx\n", + sel_max, + VMMAP_FREE_END(sel) - sz - (guardpg ? PAGE_SIZE : 0)); + bad++; + } + /* Lower boundary alignment. */ + if (align != 0 && (sel_min & (align - 1)) != 0) { + printf("sel_min: 0x%lx, not aligned to 0x%lx\n", + sel_min, align); + bad++; + } + /* Upper boundary alignment. */ + if (align != 0 && (sel_max & (align - 1)) != 0) { + printf("sel_max: 0x%lx, not aligned to 0x%lx\n", + sel_max, align); + bad++; + } + /* Lower boundary PMAP_PREFER check. */ + if (pmap_align != 0 && align == 0 && + (sel_min & (pmap_align - 1)) != pmap_off) { + printf("sel_min: 0x%lx, aligned to 0x%lx, expected 0x%lx\n", + sel_min, sel_min & (pmap_align - 1), pmap_off); + bad++; + } + /* Upper boundary PMAP_PREFER check. */ + if (pmap_align != 0 && align == 0 && + (sel_max & (pmap_align - 1)) != pmap_off) { + printf("sel_max: 0x%lx, aligned to 0x%lx, expected 0x%lx\n", + sel_max, sel_max & (pmap_align - 1), pmap_off); + bad++; + } + + if (bad) { + panic("uvm_map_sel_limits(sz = %lu, guardpg = %c, " + "align = 0x%lx, pmap_align = 0x%lx, pmap_off = 0x%lx, " + "bias = %d, " + "FREE_START(sel) = 0x%lx, FREE_END(sel) = 0x%lx)", + sz, (guardpg ? 
'T' : 'F'), align, pmap_align, pmap_off, + bias, VMMAP_FREE_START(sel), VMMAP_FREE_END(sel)); } +#endif /* DIAGNOSTIC */ - uvm_tree_sanity(map, "clip_start leave"); + *min = sel_min; + *max = sel_max; + return 0; } /* - * uvm_map_clip_end: ensure that the entry ends at or before - * the ending address, if it doesn't we split the reference - * - * => caller should use UVM_MAP_CLIP_END macro rather than calling - * this directly - * => map must be locked by caller + * Test if memory starting at addr with sz bytes is free. + * + * Fills in *start_ptr and *end_ptr to be the first and last entry describing + * the space. + * If called with prefilled *start_ptr and *end_ptr, they are to be correct. */ - -void -uvm_map_clip_end(struct vm_map *map, struct vm_map_entry *entry, vaddr_t end) +int +uvm_map_isavail(struct vm_map *map, struct uvm_addr_state *uaddr, + struct vm_map_entry **start_ptr, struct vm_map_entry **end_ptr, + vaddr_t addr, vsize_t sz) { - struct vm_map_entry *new_entry; - vaddr_t new_adj; /* #bytes we move start forward */ + struct uvm_addr_state *free; + struct uvm_map_addr *atree; + struct vm_map_entry *i, *i_end; - uvm_tree_sanity(map, "clip_end entry"); /* - * Create a new entry and insert it - * AFTER the specified entry + * Kernel memory above uvm_maxkaddr is considered unavailable. */ + if ((map->flags & VM_MAP_ISVMSPACE) == 0) { + if (addr + sz > uvm_maxkaddr) + return 0; + } - new_entry = uvm_mapent_alloc(map, 0); - uvm_mapent_copy(entry, new_entry); /* entry -> new_entry */ + atree = &map->addr; - new_entry->start = entry->end = end; - new_adj = end - entry->start; - if (new_entry->object.uvm_obj) - new_entry->offset += new_adj; + /* + * Fill in first, last, so they point at the entries containing the + * first and last address of the range. + * Note that if they are not NULL, we don't perform the lookup. + */ + KDASSERT(atree != NULL && start_ptr != NULL && end_ptr != NULL); + if (*start_ptr == NULL) { + *start_ptr = uvm_map_entrybyaddr(atree, addr); + if (*start_ptr == NULL) + return 0; + } else + KASSERT(*start_ptr == uvm_map_entrybyaddr(atree, addr)); + if (*end_ptr == NULL) { + if (VMMAP_FREE_END(*start_ptr) >= addr + sz) + *end_ptr = *start_ptr; + else { + *end_ptr = uvm_map_entrybyaddr(atree, addr + sz - 1); + if (*end_ptr == NULL) + return 0; + } + } else + KASSERT(*end_ptr == uvm_map_entrybyaddr(atree, addr + sz - 1)); - if (entry->aref.ar_amap) - amap_splitref(&entry->aref, &new_entry->aref, new_adj); - - uvm_rb_fixup(map, entry); + /* + * Validation. + */ + KDASSERT(*start_ptr != NULL && *end_ptr != NULL); + KDASSERT((*start_ptr)->start <= addr && + VMMAP_FREE_END(*start_ptr) > addr && + (*end_ptr)->start < addr + sz && + VMMAP_FREE_END(*end_ptr) >= addr + sz); - uvm_map_entry_link(map, entry, new_entry); + /* + * Check the none of the entries intersects with . + * Also, if the entry belong to uaddr_exe or uaddr_brk_stack, it is + * considered unavailable unless called by those allocators. + */ + i = *start_ptr; + i_end = RB_NEXT(uvm_map_addr, atree, *end_ptr); + for (; i != i_end; + i = RB_NEXT(uvm_map_addr, atree, i)) { + if (i->start != i->end && i->end > addr) + return 0; - if (UVM_ET_ISSUBMAP(entry)) { - /* ... 
unlikely to happen, but play it safe */ - uvm_map_reference(new_entry->object.sub_map); - } else { - if (UVM_ET_ISOBJ(entry) && - entry->object.uvm_obj->pgops && - entry->object.uvm_obj->pgops->pgo_reference) - entry->object.uvm_obj->pgops->pgo_reference( - entry->object.uvm_obj); + /* + * uaddr_exe and uaddr_brk_stack may only be used + * by these allocators and the NULL uaddr (i.e. no + * uaddr). + * Reject if this requirement is not met. + */ + if (uaddr != NULL) { + free = uvm_map_uaddr_e(map, i); + + if (uaddr != free && free != NULL && + (free == map->uaddr_exe || + free == map->uaddr_brk_stack)) + return 0; + } } - uvm_tree_sanity(map, "clip_end leave"); -} + return -1; +} /* - * M A P - m a i n e n t r y p o i n t - */ -/* - * uvm_map: establish a valid mapping in a map - * - * => assume startp is page aligned. - * => assume size is a multiple of PAGE_SIZE. - * => assume sys_mmap provides enough of a "hint" to have us skip - * over text/data/bss area. - * => map must be unlocked (we will lock it) - * => value meanings (4 cases): - * [1] == uoffset is a hint for PMAP_PREFER - * [2] == don't PMAP_PREFER - * [3] == normal mapping - * [4] == uvm_map finds offset based on VA - * - * case [4] is for kernel mappings where we don't know the offset until - * we've found a virtual address. note that kernel object offsets are - * always relative to vm_map_min(kernel_map). - * - * => if `align' is non-zero, we try to align the virtual address to - * the specified alignment. this is only a hint; if we can't - * do it, the address will be unaligned. this is provided as - * a mechanism for large pages. - * - * => XXXCDC: need way to map in external amap? + * Invoke each address selector until an address is found. + * Will not invoke uaddr_exe. */ - int -uvm_map_p(struct vm_map *map, vaddr_t *startp, vsize_t size, - struct uvm_object *uobj, voff_t uoffset, vsize_t align, uvm_flag_t flags, - struct proc *p) +uvm_map_findspace(struct vm_map *map, struct vm_map_entry**first, + struct vm_map_entry**last, vaddr_t *addr, vsize_t sz, + vaddr_t pmap_align, vaddr_t pmap_offset, vm_prot_t prot, vaddr_t hint) { - struct vm_map_entry *prev_entry, *new_entry; -#ifdef KVA_GUARDPAGES - struct vm_map_entry *guard_entry; -#endif - vm_prot_t prot = UVM_PROTECTION(flags), maxprot = - UVM_MAXPROTECTION(flags); - vm_inherit_t inherit = UVM_INHERIT(flags); - int advice = UVM_ADVICE(flags); - int error; + struct uvm_addr_state *uaddr; + int i; /* - * Holes are incompatible with other types of mappings. + * Allocation for sz bytes at any address, + * using the addr selectors in order. */ - if (flags & UVM_FLAG_HOLE) { - KASSERT(uobj == NULL && (flags & UVM_FLAG_FIXED) != 0 && - (flags & (UVM_FLAG_OVERLAY | UVM_FLAG_COPYONW)) == 0); - } + for (i = 0; i < nitems(map->uaddr_any); i++) { + uaddr = map->uaddr_any[i]; -#ifdef KVA_GUARDPAGES - if (map == kernel_map && !(flags & UVM_FLAG_FIXED)) { - /* - * kva_guardstart is initialized to the start of the kernelmap - * and cycles through the kva space. - * This way we should have a long time between re-use of kva. - */ - static vaddr_t kva_guardstart = 0; - if (kva_guardstart == 0) { - kva_guardstart = vm_map_min(map); - printf("uvm_map: kva guard pages enabled: %p\n", - kva_guardstart); - } - size += PAGE_SIZE; /* Add guard page at the end. */ - /* - * Try to fully exhaust kva prior to wrap-around. - * (This may eat your ram!) 
- */ - if (VM_MAX_KERNEL_ADDRESS - kva_guardstart < size) { - static int wrap_counter = 0; - printf("uvm_map: kva guard page wrap-around %d\n", - ++wrap_counter); - kva_guardstart = vm_map_min(map); - } - *startp = kva_guardstart; - /* - * Prepare for next round. - */ - kva_guardstart += size; + if (uvm_addr_invoke(map, uaddr, first, last, + addr, sz, pmap_align, pmap_offset, prot, hint) == 0) + return 0; } -#endif - uvm_tree_sanity(map, "map entry"); + /* + * Fall back to brk() and stack() address selectors. + */ + uaddr = map->uaddr_brk_stack; + if (uvm_addr_invoke(map, uaddr, first, last, + addr, sz, pmap_align, pmap_offset, prot, hint) == 0) + return 0; + + return ENOMEM; +} + +/* + * uvm_map: establish a valid mapping in map + * + * => *addr and sz must be a multiple of PAGE_SIZE. + * => *addr is ignored, except if flags contains UVM_FLAG_FIXED. + * => map must be unlocked. + * => value meanings (4 cases): + * [1] == uoffset is a hint for PMAP_PREFER + * [2] == don't PMAP_PREFER + * [3] == normal mapping + * [4] == uvm_map finds offset based on VA + * + * case [4] is for kernel mappings where we don't know the offset until + * we've found a virtual address. note that kernel object offsets are + * always relative to vm_map_min(kernel_map). + * + * => align: align vaddr, must be a power-of-2. + * Align is only a hint and will be ignored if the alignemnt fails. + */ +int +uvm_map(struct vm_map *map, vaddr_t *addr, vsize_t sz, + struct uvm_object *uobj, voff_t uoffset, vsize_t align, uvm_flag_t flags) +{ + struct vm_map_entry *first, *last, *entry; + struct uvm_map_deadq dead; + vm_prot_t prot; + vm_prot_t maxprot; + vm_inherit_t inherit; + int advice; + int error; + vaddr_t pmap_align, pmap_offset; + vaddr_t hint; if ((map->flags & VM_MAP_INTRSAFE) == 0) splassert(IPL_NONE); @@ -749,2147 +927,3122 @@ uvm_map_p(struct vm_map *map, vaddr_t *startp, vsize_t size, splassert(IPL_VM); /* - * step 0: sanity check of protection code + * We use pmap_align and pmap_offset as alignment and offset variables. + * + * Because the align parameter takes precedence over pmap prefer, + * the pmap_align will need to be set to align, with pmap_offset = 0, + * if pmap_prefer will not align. */ + if (uoffset == UVM_UNKNOWN_OFFSET) { + pmap_align = MAX(align, PAGE_SIZE); + pmap_offset = 0; + } else { + pmap_align = MAX(PMAP_PREFER_ALIGN(), PAGE_SIZE); + pmap_offset = PMAP_PREFER_OFFSET(uoffset); - if ((prot & maxprot) != prot) { - return (EACCES); + if (align == 0 || + (align <= pmap_align && (pmap_offset & (align - 1)) == 0)) { + /* + * pmap_offset satisfies align, no change. + */ + } else { + /* + * Align takes precedence over pmap prefer. + */ + pmap_align = align; + pmap_offset = 0; + } } /* - * step 1: figure out where to put new VM range + * Decode parameters. */ - - if (vm_map_lock_try(map) == FALSE) { - if (flags & UVM_FLAG_TRYLOCK) - return (EFAULT); - vm_map_lock(map); /* could sleep here */ - } - if ((prev_entry = uvm_map_findspace(map, *startp, size, startp, - uobj, uoffset, align, flags)) == NULL) { - vm_map_unlock(map); - return (ENOMEM); - } - -#ifdef PMAP_GROWKERNEL - { - /* - * If the kernel pmap can't map the requested space, - * then allocate more resources for it. 
- */ - if (map == kernel_map && !(flags & UVM_FLAG_FIXED) && - uvm_maxkaddr < (*startp + size)) - uvm_maxkaddr = pmap_growkernel(*startp + size); - } -#endif + prot = UVM_PROTECTION(flags); + maxprot = UVM_MAXPROTECTION(flags); + advice = UVM_ADVICE(flags); + inherit = UVM_INHERIT(flags); + error = 0; + hint = trunc_page(*addr); + TAILQ_INIT(&dead); + KASSERT((sz & (vaddr_t)PAGE_MASK) == 0); + KASSERT((align & (align - 1)) == 0); /* - * if uobj is null, then uoffset is either a VAC hint for PMAP_PREFER - * [typically from uvm_map_reserve] or it is UVM_UNKNOWN_OFFSET. in - * either case we want to zero it before storing it in the map entry - * (because it looks strange and confusing when debugging...) - * - * if uobj is not null - * if uoffset is not UVM_UNKNOWN_OFFSET then we have a normal mapping - * and we do not need to change uoffset. - * if uoffset is UVM_UNKNOWN_OFFSET then we need to find the offset - * now (based on the starting address of the map). this case is - * for kernel object mappings where we don't know the offset until - * the virtual address is found (with uvm_map_findspace). the - * offset is the distance we are from the start of the map. + * Holes are incompatible with other types of mappings. */ - - if (uobj == NULL) { - uoffset = 0; - } else { - if (uoffset == UVM_UNKNOWN_OFFSET) { - KASSERT(UVM_OBJ_IS_KERN_OBJECT(uobj)); - uoffset = *startp - vm_map_min(kernel_map); - } + if (flags & UVM_FLAG_HOLE) { + KASSERT(uobj == NULL && (flags & UVM_FLAG_FIXED) && + (flags & (UVM_FLAG_OVERLAY | UVM_FLAG_COPYONW)) == 0); } /* - * step 2: try and insert in map by extending previous entry, if - * possible - * XXX: we don't try and pull back the next entry. might be useful - * for a stack, but we are currently allocating our stack in advance. + * Unset hint for kernel_map non-fixed allocations. */ + if (!(map->flags & VM_MAP_ISVMSPACE) && !(flags & UVM_FLAG_FIXED)) + hint = 0; - if ((flags & UVM_FLAG_NOMERGE) == 0 && - prev_entry->end == *startp && prev_entry != &map->header && - prev_entry->object.uvm_obj == uobj) { - - if (uobj && prev_entry->offset + - (prev_entry->end - prev_entry->start) != uoffset) - goto step3; - - if (UVM_ET_ISSUBMAP(prev_entry)) - goto step3; - - if (prev_entry->protection != prot || - prev_entry->max_protection != maxprot) - goto step3; - - if (prev_entry->inheritance != inherit || - prev_entry->advice != advice) - goto step3; + /* + * Check protection. + */ + if ((prot & maxprot) != prot) + return EACCES; - /* wiring status must match (new area is unwired) */ - if (VM_MAPENT_ISWIRED(prev_entry)) - goto step3; + if (flags & UVM_FLAG_TRYLOCK) { + if (vm_map_lock_try(map) == FALSE) + return EFAULT; + } else + vm_map_lock(map); + first = last = NULL; + if (flags & UVM_FLAG_FIXED) { /* - * can't extend a shared amap. note: no need to lock amap to - * look at refs since we don't care about its exact value. - * if it is one (i.e. we have only reference) it will stay there + * Fixed location. + * + * Note: we ignore align, pmap_prefer. + * Fill in first, last and *addr. */ + KASSERT((*addr & PAGE_MASK) == 0); - if (prev_entry->aref.ar_amap && - amap_refs(prev_entry->aref.ar_amap) != 1) { - goto step3; + /* + * Grow pmap to include allocated address. + * If the growth fails, the allocation will fail too. + */ + if ((map->flags & VM_MAP_ISVMSPACE) == 0 && + uvm_maxkaddr < (*addr + sz)) { + uvm_map_kmem_grow(map, &dead, + *addr + sz - uvm_maxkaddr, flags); } /* - * Only merge kernel mappings, but keep track - * of how much we skipped. 
+ * Check that the space is available. */ - if (map != kernel_map && map != kmem_map) { - goto step3; + if (!uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) { + error = ENOMEM; + goto unlock; } + } else if (*addr != 0 && (*addr & PAGE_MASK) == 0 && + (map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE && + (align == 0 || (*addr & (align - 1)) == 0) && + uvm_map_isavail(map, NULL, &first, &last, *addr, sz)) { + /* + * Address used as hint. + * + * Note: we enforce the alignment restriction, + * but ignore pmap_prefer. + */ + } else if ((maxprot & VM_PROT_EXECUTE) != 0 && + map->uaddr_exe != NULL) { + /* + * Run selection algorithm for executables. + */ + error = uvm_addr_invoke(map, map->uaddr_exe, &first, &last, + addr, sz, pmap_align, pmap_offset, prot, hint); + + /* + * Grow kernel memory and try again. + */ + if (error != 0 && (map->flags & VM_MAP_ISVMSPACE) == 0) { + uvm_map_kmem_grow(map, &dead, sz, flags); - if (prev_entry->aref.ar_amap) { - error = amap_extend(prev_entry, size); - if (error) - goto step3; + error = uvm_addr_invoke(map, map->uaddr_exe, + &first, &last, addr, sz, + pmap_align, pmap_offset, prot, hint); } + if (error != 0) + goto unlock; + } else { /* - * drop our reference to uobj since we are extending a reference - * that we already have (the ref count can not drop to zero). + * Update freelists from vmspace. */ + if (map->flags & VM_MAP_ISVMSPACE) + uvm_map_vmspace_update(map, &dead, flags); - if (uobj && uobj->pgops->pgo_detach) - uobj->pgops->pgo_detach(uobj); - - prev_entry->end += size; - uvm_rb_fixup(map, prev_entry); - map->size += size; - if (p && uobj == NULL) - p->p_vmspace->vm_dused += atop(size); + error = uvm_map_findspace(map, &first, &last, addr, sz, + pmap_align, pmap_offset, prot, hint); - uvm_tree_sanity(map, "map leave 2"); + /* + * Grow kernel memory and try again. + */ + if (error != 0 && (map->flags & VM_MAP_ISVMSPACE) == 0) { + uvm_map_kmem_grow(map, &dead, sz, flags); - vm_map_unlock(map); - return (0); + error = uvm_map_findspace(map, &first, &last, addr, sz, + pmap_align, pmap_offset, prot, hint); + } + if (error != 0) + goto unlock; } -step3: + + KASSERT((map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE || + uvm_maxkaddr >= *addr + sz); /* - * step 3: allocate new entry and link it in + * If we only want a query, return now. */ - -#ifdef KVA_GUARDPAGES - if (map == kernel_map && !(flags & UVM_FLAG_FIXED)) - size -= PAGE_SIZE; -#endif - - new_entry = uvm_mapent_alloc(map, flags); - if (new_entry == NULL) { - vm_map_unlock(map); - return (ENOMEM); + if (flags & UVM_FLAG_QUERY) { + error = 0; + goto unlock; } - new_entry->start = *startp; - new_entry->end = new_entry->start + size; - new_entry->object.uvm_obj = uobj; - new_entry->offset = uoffset; - if (uobj) - new_entry->etype = UVM_ET_OBJ; - else - new_entry->etype = 0; + if (uobj == NULL) + uoffset = 0; + else if (uoffset == UVM_UNKNOWN_OFFSET) { + KASSERT(UVM_OBJ_IS_KERN_OBJECT(uobj)); + uoffset = *addr - vm_map_min(kernel_map); + } + /* + * Create new entry. + * first and last may be invalidated after this call. 
+ */ + entry = uvm_map_mkentry(map, first, last, *addr, sz, flags, &dead); + if (entry == NULL) { + error = ENOMEM; + goto unlock; + } + KDASSERT(entry->start == *addr && entry->end == *addr + sz); + entry->object.uvm_obj = uobj; + entry->offset = uoffset; + entry->protection = prot; + entry->max_protection = maxprot; + entry->inheritance = inherit; + entry->wired_count = 0; + entry->advice = advice; + if (uobj) + entry->etype |= UVM_ET_OBJ; + else if (flags & UVM_FLAG_HOLE) + entry->etype |= UVM_ET_HOLE; if (flags & UVM_FLAG_COPYONW) { - new_entry->etype |= UVM_ET_COPYONWRITE; + entry->etype |= UVM_ET_COPYONWRITE; if ((flags & UVM_FLAG_OVERLAY) == 0) - new_entry->etype |= UVM_ET_NEEDSCOPY; + entry->etype |= UVM_ET_NEEDSCOPY; } - if (flags & UVM_FLAG_HOLE) - new_entry->etype |= UVM_ET_HOLE; - - new_entry->protection = prot; - new_entry->max_protection = maxprot; - new_entry->inheritance = inherit; - new_entry->wired_count = 0; - new_entry->advice = advice; if (flags & UVM_FLAG_OVERLAY) { - /* - * to_add: for BSS we overallocate a little since we - * are likely to extend - */ - vaddr_t to_add = (flags & UVM_FLAG_AMAPPAD) ? - UVM_AMAP_CHUNK << PAGE_SHIFT : 0; - struct vm_amap *amap = amap_alloc(size, to_add, M_WAITOK); - new_entry->aref.ar_pageoff = 0; - new_entry->aref.ar_amap = amap; - } else { - new_entry->aref.ar_pageoff = 0; - new_entry->aref.ar_amap = NULL; + entry->aref.ar_pageoff = 0; + entry->aref.ar_amap = amap_alloc(sz, + ptoa(flags & UVM_FLAG_AMAPPAD ? UVM_AMAP_CHUNK : 0), + M_WAITOK); } - uvm_map_entry_link(map, prev_entry, new_entry); - - map->size += size; - if (p && uobj == NULL) - p->p_vmspace->vm_dused += atop(size); - - - /* - * Update the free space hint - */ - - if ((map->first_free == prev_entry) && - (prev_entry->end >= new_entry->start)) - map->first_free = new_entry; - -#ifdef KVA_GUARDPAGES /* - * Create the guard entry. + * Update map and process statistics. */ - if (map == kernel_map && !(flags & UVM_FLAG_FIXED)) { - guard_entry = uvm_mapent_alloc(map, flags); - if (guard_entry != NULL) { - guard_entry->start = new_entry->end; - guard_entry->end = guard_entry->start + PAGE_SIZE; - guard_entry->object.uvm_obj = uobj; - guard_entry->offset = uoffset; - guard_entry->etype = MAP_ET_KVAGUARD; - guard_entry->protection = prot; - guard_entry->max_protection = maxprot; - guard_entry->inheritance = inherit; - guard_entry->wired_count = 0; - guard_entry->advice = advice; - guard_entry->aref.ar_pageoff = 0; - guard_entry->aref.ar_amap = NULL; - uvm_map_entry_link(map, new_entry, guard_entry); - map->size += PAGE_SIZE; - kva_guardpages++; + if (!(flags & UVM_FLAG_HOLE)) { + map->size += sz; + if ((map->flags & VM_MAP_ISVMSPACE) && uobj == NULL) { + ((struct vmspace *)map)->vm_dused += + uvmspace_dused(map, *addr, *addr + sz); } } -#endif - uvm_tree_sanity(map, "map leave"); + /* + * Try to merge entry. + * + * Userland allocations are kept separated most of the time. + * Forego the effort of merging what most of the time can't be merged + * and only try the merge if it concerns a kernel entry. + */ + if ((flags & UVM_FLAG_NOMERGE) == 0 && + (map->flags & VM_MAP_ISVMSPACE) == 0) + uvm_mapent_tryjoin(map, entry, &dead); +unlock: vm_map_unlock(map); - return (0); + + /* + * Remove dead entries. + * + * Dead entries may be the result of merging. + * uvm_map_mkentry may also create dead entries, when it attempts to + * destroy free-space entries. 
+ */ + uvm_unmap_detach(&dead, 0); + return error; } /* - * uvm_map_lookup_entry: find map entry at or before an address - * - * => map must at least be read-locked by caller - * => entry is returned in "entry" - * => return value is true if address is in the returned entry + * True iff e1 and e2 can be joined together. */ - -boolean_t -uvm_map_lookup_entry(struct vm_map *map, vaddr_t address, - struct vm_map_entry **entry) +int +uvm_mapent_isjoinable(struct vm_map *map, struct vm_map_entry *e1, + struct vm_map_entry *e2) { - struct vm_map_entry *cur; - struct vm_map_entry *last; - int use_tree = 0; + KDASSERT(e1 != NULL && e2 != NULL); /* - * start looking either from the head of the - * list, or from the hint. + * Must be the same entry type and not have free memory between. */ + if (e1->etype != e2->etype || e1->end != e2->start) + return 0; - simple_lock(&map->hint_lock); - cur = map->hint; - simple_unlock(&map->hint_lock); - - if (cur == &map->header) - cur = cur->next; - - if (address >= cur->start) { - /* - * go from hint to end of list. - * - * but first, make a quick check to see if - * we are already looking at the entry we - * want (which is usually the case). - * note also that we don't need to save the hint - * here... it is the same hint (unless we are - * at the header, in which case the hint didn't - * buy us anything anyway). - */ - last = &map->header; - if ((cur != last) && (cur->end > address)) { - *entry = cur; - return (TRUE); - } - - if (map->nentries > 30) - use_tree = 1; - } else { - /* - * go from start to hint, *inclusively* - */ - last = cur->next; - cur = map->header.next; - use_tree = 1; - } + /* + * Submaps are never joined. + */ + if (UVM_ET_ISSUBMAP(e1)) + return 0; - uvm_tree_sanity(map, __func__); + /* + * Never merge wired memory. + */ + if (VM_MAPENT_ISWIRED(e1) || VM_MAPENT_ISWIRED(e2)) + return 0; - if (use_tree) { - struct vm_map_entry *prev = &map->header; - cur = RB_ROOT(&map->rbhead); + /* + * Protection, inheritance and advice must be equal. + */ + if (e1->protection != e2->protection || + e1->max_protection != e2->max_protection || + e1->inheritance != e2->inheritance || + e1->advice != e2->advice) + return 0; - /* - * Simple lookup in the tree. Happens when the hint is - * invalid, or nentries reach a threshold. - */ - while (cur) { - if (address >= cur->start) { - if (address < cur->end) { - *entry = cur; - SAVE_HINT(map, map->hint, cur); - return (TRUE); - } - prev = cur; - cur = RB_RIGHT(cur, rb_entry); - } else - cur = RB_LEFT(cur, rb_entry); - } - *entry = prev; - return (FALSE); + /* + * If uvm_object: objects itself and offsets within object must match. + */ + if (UVM_ET_ISOBJ(e1)) { + if (e1->object.uvm_obj != e2->object.uvm_obj) + return 0; + if (e1->offset + (e1->end - e1->start) != e2->offset) + return 0; } /* - * search linearly + * Cannot join shared amaps. + * Note: no need to lock amap to look at refs, since we don't care + * about its exact value. + * If it is 1 (i.e. we have the only reference) it will stay there. */ + if (e1->aref.ar_amap && amap_refs(e1->aref.ar_amap) != 1) + return 0; + if (e2->aref.ar_amap && amap_refs(e2->aref.ar_amap) != 1) + return 0; - while (cur != last) { - if (cur->end > address) { - if (address >= cur->start) { - /* - * save this lookup for future - * hints, and return - */ - - *entry = cur; - SAVE_HINT(map, map->hint, cur); - return (TRUE); - } - break; - } - cur = cur->next; - } - - *entry = cur->prev; - SAVE_HINT(map, map->hint, *entry); - return (FALSE); + /* + * Apprently, e1 and e2 match. 
+ */ + return 1; } /* - * Checks if address pointed to by phint fits into the empty - * space before the vm_map_entry after. Takes alignment and - * offset into consideration. + * Join support function. + * + * Returns the merged entry on succes. + * Returns NULL if the merge failed. */ - -int -uvm_map_spacefits(struct vm_map *map, vaddr_t *phint, vsize_t length, - struct vm_map_entry *after, voff_t uoffset, vsize_t align) +struct vm_map_entry* +uvm_mapent_merge(struct vm_map *map, struct vm_map_entry *e1, + struct vm_map_entry *e2, struct uvm_map_deadq *dead) { - vaddr_t hint = *phint; - vaddr_t end; + struct uvm_addr_state *free; -#ifdef PMAP_PREFER /* - * push hint forward as needed to avoid VAC alias problems. - * we only do this if a valid offset is specified. + * Amap of e1 must be extended to include e2. + * e2 contains no real information in its amap, + * so it can be erased immediately. + */ + if (e1->aref.ar_amap) { + if (amap_extend(e1, e2->end - e2->start)) + return NULL; + } + + /* + * Don't drop obj reference: + * uvm_unmap_detach will do this for us. */ - if (uoffset != UVM_UNKNOWN_OFFSET) - hint = PMAP_PREFER(uoffset, hint); -#endif - if (align != 0) - if ((hint & (align - 1)) != 0) - hint = roundup(hint, align); - *phint = hint; - end = hint + length; - if (end > map->max_offset || end < hint) - return (FALSE); - if (after != NULL && after != &map->header && after->start < end) - return (FALSE); - - return (TRUE); + free = uvm_map_uaddr_e(map, e1); + if (free) + uvm_mapent_free_remove(map, free, e1); + + free = uvm_map_uaddr_e(map, e2); + if (free) + uvm_mapent_free_remove(map, free, e2); + uvm_mapent_addr_remove(map, e2); + e1->end = e2->end; + e1->guard = e2->guard; + e1->fspace = e2->fspace; + if (free) + uvm_mapent_free_insert(map, free, e1); + + DEAD_ENTRY_PUSH(dead, e2); + return e1; } /* - * uvm_map_pie: return a random load address for a PIE executable - * properly aligned. + * Attempt forward and backward joining of entry. + * + * Returns entry after joins. + * We are guaranteed that the amap of entry is either non-existant or + * has never been used. */ +struct vm_map_entry* +uvm_mapent_tryjoin(struct vm_map *map, struct vm_map_entry *entry, + struct uvm_map_deadq *dead) +{ + struct vm_map_entry *other; + struct vm_map_entry *merged; -#ifndef VM_PIE_MAX_ADDR -#define VM_PIE_MAX_ADDR (VM_MAXUSER_ADDRESS / 4) -#endif + /* + * Merge with previous entry. + */ + other = RB_PREV(uvm_map_addr, &map->addr, entry); + if (other && uvm_mapent_isjoinable(map, other, entry)) { + merged = uvm_mapent_merge(map, other, entry, dead); + if (merged) + entry = merged; + } -#ifndef VM_PIE_MIN_ADDR -#define VM_PIE_MIN_ADDR VM_MIN_ADDRESS -#endif + /* + * Merge with next entry. + * + * Because amap can only extend forward and the next entry + * probably contains sensible info, only perform forward merging + * in the absence of an amap. + */ + other = RB_NEXT(uvm_map_addr, &map->addr, entry); + if (other && entry->aref.ar_amap == NULL && + other->aref.ar_amap == NULL && + uvm_mapent_isjoinable(map, entry, other)) { + merged = uvm_mapent_merge(map, entry, other, dead); + if (merged) + entry = merged; + } -#ifndef VM_PIE_MIN_ALIGN -#define VM_PIE_MIN_ALIGN PAGE_SIZE -#endif + return entry; +} -vaddr_t -uvm_map_pie(vaddr_t align) +/* + * Kill entries that are no longer in a map. 
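/*
 * Caller-side sketch of the dead-entry convention used by this function
 * (it mirrors uvm_unmap() further down): entries are unlinked from the
 * map while the lock is held, and their amap/object references are only
 * dropped here, after the lock has been released, since that may sleep.
 */
	struct uvm_map_deadq dead;

	TAILQ_INIT(&dead);
	vm_map_lock(map);
	uvm_unmap_remove(map, start, end, &dead, FALSE, TRUE);
	vm_map_unlock(map);

	uvm_unmap_detach(&dead, 0);	/* map lock no longer held */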
+ */ +void +uvm_unmap_detach(struct uvm_map_deadq *deadq, int flags) { - vaddr_t addr, space, min; + struct vm_map_entry *entry; - align = MAX(align, VM_PIE_MIN_ALIGN); + while ((entry = TAILQ_FIRST(deadq)) != NULL) { + /* + * Drop reference to amap, if we've got one. + */ + if (entry->aref.ar_amap) + amap_unref(entry->aref.ar_amap, + entry->aref.ar_pageoff, + atop(entry->end - entry->start), + flags); - /* round up to next alignment */ - min = (VM_PIE_MIN_ADDR + align - 1) & ~(align - 1); - - if (align >= VM_PIE_MAX_ADDR || min >= VM_PIE_MAX_ADDR) - return (align); - - space = (VM_PIE_MAX_ADDR - min) / align; - space = MIN(space, (u_int32_t)-1); - - addr = (vaddr_t)arc4random_uniform((u_int32_t)space) * align; - addr += min; - - return (addr); -} - -/* - * uvm_map_hint: return the beginning of the best area suitable for - * creating a new mapping with "prot" protection. - */ -vaddr_t -uvm_map_hint1(struct proc *p, vm_prot_t prot, int skipheap) -{ - vaddr_t addr; + /* + * Drop reference to our backing object, if we've got one. + */ + if (UVM_ET_ISSUBMAP(entry)) { + /* ... unlikely to happen, but play it safe */ + uvm_map_deallocate(entry->object.sub_map); + } else if (UVM_ET_ISOBJ(entry) && + entry->object.uvm_obj->pgops->pgo_detach) { + entry->object.uvm_obj->pgops->pgo_detach( + entry->object.uvm_obj); + } -#ifdef __i386__ - /* - * If executable skip first two pages, otherwise start - * after data + heap region. - */ - if ((prot & VM_PROT_EXECUTE) && - ((vaddr_t)p->p_vmspace->vm_daddr >= I386_MAX_EXE_ADDR)) { - addr = (PAGE_SIZE*2) + - (arc4random() & (I386_MAX_EXE_ADDR / 2 - 1)); - return (round_page(addr)); + /* + * Step to next. + */ + TAILQ_REMOVE(deadq, entry, dfree.deadq); + uvm_mapent_free(entry); } -#endif - /* start malloc/mmap after the brk */ - addr = (vaddr_t)p->p_vmspace->vm_daddr; - if (skipheap) - addr += BRKSIZ; -#if !defined(__vax__) - addr += arc4random() & (MIN((256 * 1024 * 1024), BRKSIZ) - 1); -#endif - return (round_page(addr)); } /* - * uvm_map_findspace: find "length" sized space in "map". + * Create and insert new entry. * - * => "hint" is a hint about where we want it, unless FINDSPACE_FIXED is - * set (in which case we insist on using "hint"). - * => "result" is VA returned - * => uobj/uoffset are to be used to handle VAC alignment, if required - * => if `align' is non-zero, we attempt to align to that value. - * => caller must at least have read-locked map - * => returns NULL on failure, or pointer to prev. map entry if success - * => note this is a cross between the old vm_map_findspace and vm_map_find + * Returned entry contains new addresses and is inserted properly in the tree. + * first and last are (probably) no longer valid. 
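/*
 * Sketch of the free-space bookkeeping used below.  Every entry also
 * describes the gap that follows it; assuming the uvm_map.h definitions
 * that accompany this change, the VMMAP_FREE_* macros expand roughly to:
 */
#define VMMAP_FREE_START(_entry)	((_entry)->end + (_entry)->guard)
#define VMMAP_FREE_END(_entry)		((_entry)->end + (_entry)->guard + \
					    (_entry)->fspace)
/* An entry with fspace == 0 therefore owns no free virtual addresses. */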
*/ - -struct vm_map_entry * -uvm_map_findspace(struct vm_map *map, vaddr_t hint, vsize_t length, - vaddr_t *result, struct uvm_object *uobj, voff_t uoffset, vsize_t align, - int flags) +struct vm_map_entry* +uvm_map_mkentry(struct vm_map *map, struct vm_map_entry *first, + struct vm_map_entry *last, vaddr_t addr, vsize_t sz, int flags, + struct uvm_map_deadq *dead) { - struct vm_map_entry *entry, *next, *tmp; - struct vm_map_entry *child, *prev = NULL; - vaddr_t end, orig_hint; + struct vm_map_entry *entry, *prev; + struct uvm_addr_state *free; + vaddr_t min, max; /* free space boundaries for new entry */ + + KDASSERT(map != NULL); + KDASSERT(first != NULL); + KDASSERT(last != NULL); + KDASSERT(dead != NULL); + KDASSERT(sz > 0); + KDASSERT(addr + sz > addr); + KDASSERT(first->end <= addr && VMMAP_FREE_END(first) > addr); + KDASSERT(last->start < addr + sz && VMMAP_FREE_END(last) >= addr + sz); + KDASSERT(uvm_map_isavail(map, NULL, &first, &last, addr, sz)); + uvm_tree_sanity(map, __FILE__, __LINE__); + + min = addr + sz; + max = VMMAP_FREE_END(last); - KASSERT((align & (align - 1)) == 0); - KASSERT((flags & UVM_FLAG_FIXED) == 0 || align == 0); + /* + * Initialize new entry. + */ + entry = uvm_mapent_alloc(map, flags); + if (entry == NULL) + return NULL; + entry->offset = 0; + entry->etype = 0; + entry->wired_count = 0; + entry->aref.ar_pageoff = 0; + entry->aref.ar_amap = NULL; - uvm_tree_sanity(map, "map_findspace entry"); + entry->start = addr; + entry->end = min; + entry->guard = 0; + entry->fspace = 0; /* - * remember the original hint. if we are aligning, then we - * may have to try again with no alignment constraint if - * we fail the first time. + * Reset free space in first. */ + free = uvm_map_uaddr_e(map, first); + if (free) + uvm_mapent_free_remove(map, free, first); + first->guard = 0; + first->fspace = 0; - orig_hint = hint; - if (hint < map->min_offset) { /* check ranges ... */ - if (flags & UVM_FLAG_FIXED) { - return(NULL); - } - hint = map->min_offset; - } - if (hint > map->max_offset) { - return(NULL); + /* + * Remove all entries that are fully replaced. + * We are iterating using last in reverse order. + */ + for (; first != last; last = prev) { + prev = RB_PREV(uvm_map_addr, &map->addr, last); + + KDASSERT(last->start == last->end); + free = uvm_map_uaddr_e(map, last); + if (free) + uvm_mapent_free_remove(map, free, last); + uvm_mapent_addr_remove(map, last); + DEAD_ENTRY_PUSH(dead, last); } - /* - * Look for the first possible address; if there's already - * something at this address, we have to start after it. + * Remove first if it is entirely inside . */ - - if ((flags & UVM_FLAG_FIXED) == 0 && hint == map->min_offset) { - if ((entry = map->first_free) != &map->header) - hint = entry->end; + if (first->start == addr) { + uvm_mapent_addr_remove(map, first); + DEAD_ENTRY_PUSH(dead, first); } else { - if (uvm_map_lookup_entry(map, hint, &tmp)) { - /* "hint" address already in use ... */ - if (flags & UVM_FLAG_FIXED) { - return(NULL); - } - hint = tmp->end; - } - entry = tmp; - } - - if (flags & UVM_FLAG_FIXED) { - end = hint + length; - if (end > map->max_offset || end < hint) { - goto error; - } - next = entry->next; - if (next == &map->header || next->start >= end) - goto found; - return(NULL); /* only one shot at it ... 
*/ - } - - /* Try to find the space in the red-black tree */ - - /* Check slot before any entry */ - if (uvm_map_spacefits(map, &hint, length, entry->next, uoffset, align)) - goto found; - - /* If there is not enough space in the whole tree, we fail */ - tmp = RB_ROOT(&map->rbhead); - if (tmp == NULL || tmp->space < length) - goto error; - - /* Find an entry close to hint that has enough space */ - for (; tmp;) { - if (tmp->end >= hint && - (prev == NULL || tmp->end < prev->end)) { - if (tmp->ownspace >= length) - prev = tmp; - else if ((child = RB_RIGHT(tmp, rb_entry)) != NULL && - child->space >= length) - prev = tmp; - } - if (tmp->end < hint) - child = RB_RIGHT(tmp, rb_entry); - else if (tmp->end > hint) - child = RB_LEFT(tmp, rb_entry); - else { - if (tmp->ownspace >= length) - break; - child = RB_RIGHT(tmp, rb_entry); - } - if (child == NULL || child->space < length) - break; - tmp = child; - } - - if (tmp != NULL && hint < tmp->end + tmp->ownspace) { - /* - * Check if the entry that we found satifies the - * space requirement - */ - if (hint < tmp->end) - hint = tmp->end; - if (uvm_map_spacefits(map, &hint, length, tmp->next, uoffset, - align)) { - entry = tmp; - goto found; - } else if (tmp->ownspace >= length) - goto listsearch; - } - if (prev == NULL) - goto error; - - hint = prev->end; - if (uvm_map_spacefits(map, &hint, length, prev->next, uoffset, - align)) { - entry = prev; - goto found; - } else if (prev->ownspace >= length) - goto listsearch; - - tmp = RB_RIGHT(prev, rb_entry); - for (;;) { - KASSERT(tmp && tmp->space >= length); - child = RB_LEFT(tmp, rb_entry); - if (child && child->space >= length) { - tmp = child; - continue; - } - if (tmp->ownspace >= length) - break; - tmp = RB_RIGHT(tmp, rb_entry); - } - - hint = tmp->end; - if (uvm_map_spacefits(map, &hint, length, tmp->next, uoffset, align)) { - entry = tmp; - goto found; + uvm_map_fix_space(map, first, VMMAP_FREE_START(first), + addr, flags); } - /* - * The tree fails to find an entry because of offset or alignment - * restrictions. Search the list instead. - */ - listsearch: /* - * Look through the rest of the map, trying to fit a new region in - * the gap between existing regions, or after the very last region. - * note: entry->end = base VA of current gap, - * next->start = VA of end of current gap + * Finally, link in entry. */ - for (;; hint = (entry = next)->end) { - /* - * Find the end of the proposed new region. Be sure we didn't - * go beyond the end of the map, or wrap around the address; - * if so, we lose. Otherwise, if this is the last entry, or - * if the proposed new region fits before the next entry, we - * win. - */ + uvm_mapent_addr_insert(map, entry); + uvm_map_fix_space(map, entry, min, max, flags); -#ifdef PMAP_PREFER - /* - * push hint forward as needed to avoid VAC alias problems. - * we only do this if a valid offset is specified. - */ - if (uoffset != UVM_UNKNOWN_OFFSET) - hint = PMAP_PREFER(uoffset, hint); -#endif - if (align != 0) { - if ((hint & (align - 1)) != 0) - hint = roundup(hint, align); - /* - * XXX Should we PMAP_PREFER() here again? 
- */ - } - end = hint + length; - if (end > map->max_offset || end < hint) { - goto error; + uvm_tree_sanity(map, __FILE__, __LINE__); + return entry; +} + +/* + * uvm_mapent_alloc: allocate a map entry + */ +struct vm_map_entry * +uvm_mapent_alloc(struct vm_map *map, int flags) +{ + struct vm_map_entry *me, *ne; + int s, i; + int pool_flags; + + pool_flags = PR_WAITOK; + if (flags & UVM_FLAG_TRYLOCK) + pool_flags = PR_NOWAIT; + + if (map->flags & VM_MAP_INTRSAFE || cold) { + s = splvm(); + simple_lock(&uvm.kentry_lock); + me = uvm.kentry_free; + if (me == NULL) { + ne = km_alloc(PAGE_SIZE, &kv_page, &kp_dirty, + &kd_nowait); + if (ne == NULL) + panic("uvm_mapent_alloc: cannot allocate map " + "entry"); + for (i = 0; + i < PAGE_SIZE / sizeof(struct vm_map_entry) - 1; + i++) + RB_LEFT(&ne[i], daddrs.addr_entry) = &ne[i + 1]; + RB_LEFT(&ne[i], daddrs.addr_entry) = NULL; + me = ne; + if (ratecheck(&uvm_kmapent_last_warn_time, + &uvm_kmapent_warn_rate)) + printf("uvm_mapent_alloc: out of static " + "map entries\n"); } - next = entry->next; - if (next == &map->header || next->start >= end) - break; + uvm.kentry_free = RB_LEFT(me, daddrs.addr_entry); + uvmexp.kmapent++; + simple_unlock(&uvm.kentry_lock); + splx(s); + me->flags = UVM_MAP_STATIC; + } else if (map == kernel_map) { + splassert(IPL_NONE); + me = pool_get(&uvm_map_entry_kmem_pool, pool_flags); + if (me == NULL) + goto out; + me->flags = UVM_MAP_KMEM; + } else { + splassert(IPL_NONE); + me = pool_get(&uvm_map_entry_pool, pool_flags); + if (me == NULL) + goto out; + me->flags = 0; } - found: - SAVE_HINT(map, map->hint, entry); - *result = hint; - return (entry); - error: - if (align != 0) { - return (uvm_map_findspace(map, orig_hint, - length, result, uobj, uoffset, 0, flags)); + if (me != NULL) { + RB_LEFT(me, daddrs.addr_entry) = + RB_RIGHT(me, daddrs.addr_entry) = + RB_PARENT(me, daddrs.addr_entry) = UVMMAP_DEADBEEF; } - return (NULL); -} -/* - * U N M A P - m a i n e n t r y p o i n t - */ +out: + return(me); +} /* - * uvm_unmap: remove mappings from a vm_map (from "start" up to "stop") + * uvm_mapent_free: free map entry * - * => caller must check alignment and size - * => map must be unlocked (we will lock it) + * => XXX: static pool for kernel map? */ void -uvm_unmap_p(vm_map_t map, vaddr_t start, vaddr_t end, struct proc *p) +uvm_mapent_free(struct vm_map_entry *me) { - vm_map_entry_t dead_entries; - - /* - * work now done by helper functions. wipe the pmap's and then - * detach from the dead entries... - */ - vm_map_lock(map); - uvm_unmap_remove(map, start, end, &dead_entries, p, FALSE); - vm_map_unlock(map); - - if (dead_entries != NULL) - uvm_unmap_detach(dead_entries, 0); + int s; + if (me->flags & UVM_MAP_STATIC) { + s = splvm(); + simple_lock(&uvm.kentry_lock); + RB_LEFT(me, daddrs.addr_entry) = uvm.kentry_free; + uvm.kentry_free = me; + uvmexp.kmapent--; + simple_unlock(&uvm.kentry_lock); + splx(s); + } else if (me->flags & UVM_MAP_KMEM) { + splassert(IPL_NONE); + pool_put(&uvm_map_entry_kmem_pool, me); + } else { + splassert(IPL_NONE); + pool_put(&uvm_map_entry_pool, me); + } } - /* - * U N M A P - m a i n h e l p e r f u n c t i o n s + * uvm_map_lookup_entry: find map entry at or before an address. + * + * => map must at least be read-locked by caller + * => entry is returned in "entry" + * => return value is true if address is in the returned entry + * ET_HOLE entries are considered to not contain a mapping, ergo FALSE is + * returned for those mappings. 
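/*
 * Usage sketch for the new lookup below: only the boolean result says
 * whether the address is backed by a real mapping; FALSE also covers
 * addresses that fall inside UVM_ET_HOLE entries.
 */
	struct vm_map_entry *ent;

	if (uvm_map_lookup_entry(map, va, &ent)) {
		/* mapped: ent->start <= va && va < ent->end */
	} else {
		/* va is unmapped, or only covered by a hole entry */
	}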
*/ +boolean_t +uvm_map_lookup_entry(struct vm_map *map, vaddr_t address, + struct vm_map_entry **entry) +{ + *entry = uvm_map_entrybyaddr(&map->addr, address); + return *entry != NULL && !UVM_ET_ISHOLE(*entry) && + (*entry)->start <= address && (*entry)->end > address; +} /* - * uvm_unmap_remove: remove mappings from a vm_map (from "start" up to "stop") - * - * => caller must check alignment and size - * => map must be locked by caller - * => we return a list of map entries that we've remove from the map - * in "entry_list" + * uvm_map_pie: return a random load address for a PIE executable + * properly aligned. */ +#ifndef VM_PIE_MAX_ADDR +#define VM_PIE_MAX_ADDR (VM_MAXUSER_ADDRESS / 4) +#endif -void -uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end, - struct vm_map_entry **entry_list, struct proc *p, boolean_t remove_holes) +#ifndef VM_PIE_MIN_ADDR +#define VM_PIE_MIN_ADDR VM_MIN_ADDRESS +#endif + +#ifndef VM_PIE_MIN_ALIGN +#define VM_PIE_MIN_ALIGN PAGE_SIZE +#endif + +vaddr_t +uvm_map_pie(vaddr_t align) { - struct vm_map_entry *entry, *first_entry, *next; - vaddr_t len; + vaddr_t addr, space, min; + + align = MAX(align, VM_PIE_MIN_ALIGN); - VM_MAP_RANGE_CHECK(map, start, end); + /* round up to next alignment */ + min = (VM_PIE_MIN_ADDR + align - 1) & ~(align - 1); - uvm_tree_sanity(map, "unmap_remove entry"); + if (align >= VM_PIE_MAX_ADDR || min >= VM_PIE_MAX_ADDR) + return (align); - if ((map->flags & VM_MAP_INTRSAFE) == 0) - splassert(IPL_NONE); - else - splassert(IPL_VM); + space = (VM_PIE_MAX_ADDR - min) / align; + space = MIN(space, (u_int32_t)-1); - /* - * find first entry - */ - if (uvm_map_lookup_entry(map, start, &first_entry) == TRUE) { - /* clip and go... */ - entry = first_entry; - UVM_MAP_CLIP_START(map, entry, start); - /* critical! prevents stale hint */ - SAVE_HINT(map, entry, entry->prev); + addr = (vaddr_t)arc4random_uniform((u_int32_t)space) * align; + addr += min; - } else { - entry = first_entry->next; - } + return (addr); +} + +void +uvm_unmap(struct vm_map *map, vaddr_t start, vaddr_t end) +{ + struct uvm_map_deadq dead; + + KASSERT((start & (vaddr_t)PAGE_MASK) == 0 && + (end & (vaddr_t)PAGE_MASK) == 0); + TAILQ_INIT(&dead); + vm_map_lock(map); + uvm_unmap_remove(map, start, end, &dead, FALSE, TRUE); + vm_map_unlock(map); + + uvm_unmap_detach(&dead, 0); +} +/* + * Mark entry as free. + * + * entry will be put on the dead list. + * The free space will be merged into the previous or a new entry, + * unless markfree is false. + */ +void +uvm_mapent_mkfree(struct vm_map *map, struct vm_map_entry *entry, + struct vm_map_entry **prev_ptr, struct uvm_map_deadq *dead, + boolean_t markfree) +{ + struct uvm_addr_state *free; + struct vm_map_entry *prev; + vaddr_t addr; /* Start of freed range. */ + vaddr_t end; /* End of freed range. */ + + prev = *prev_ptr; + if (prev == entry) + *prev_ptr = prev = NULL; + + if (prev == NULL || + VMMAP_FREE_END(prev) != entry->start) + prev = RB_PREV(uvm_map_addr, &map->addr, entry); /* - * Save the free space hint + * Entry is describing only free memory and has nothing to drain into. 
*/ + if (prev == NULL && entry->start == entry->end && markfree) { + *prev_ptr = entry; + return; + } - if (map->first_free->start >= start) - map->first_free = entry->prev; + addr = entry->start; + end = VMMAP_FREE_END(entry); + free = uvm_map_uaddr_e(map, entry); + if (free) + uvm_mapent_free_remove(map, free, entry); + uvm_mapent_addr_remove(map, entry); + DEAD_ENTRY_PUSH(dead, entry); + + if (markfree) { + if (prev) { + free = uvm_map_uaddr_e(map, prev); + if (free) + uvm_mapent_free_remove(map, free, prev); + } + *prev_ptr = uvm_map_fix_space(map, prev, addr, end, 0); + } +} +/* + * Unwire and release referenced amap and object from map entry. + */ +void +uvm_unmap_kill_entry(struct vm_map *map, struct vm_map_entry *entry) +{ /* - * note: we now re-use first_entry for a different task. we remove - * a number of map entries from the map and save them in a linked - * list headed by "first_entry". once we remove them from the map - * the caller should unlock the map and drop the references to the - * backing objects [c.f. uvm_unmap_detach]. the object is to - * separate unmapping from reference dropping. why? - * [1] the map has to be locked for unmapping - * [2] the map need not be locked for reference dropping - * [3] dropping references may trigger pager I/O, and if we hit - * a pager that does synchronous I/O we may have to wait for it. - * [4] we would like all waiting for I/O to occur with maps unlocked - * so that we don't block other threads. + * Unwire removed map entry. */ - first_entry = NULL; - *entry_list = NULL; /* to be safe */ + if (VM_MAPENT_ISWIRED(entry)) { + entry->wired_count = 0; + uvm_fault_unwire_locked(map, entry->start, entry->end); + } /* - * break up the area into map entry sized regions and unmap. note - * that all mappings have to be removed before we can even consider - * dropping references to amaps or VM objects (otherwise we could end - * up with a mapping to a page on the free list which would be very bad) + * Entry-type specific code. */ - - while ((entry != &map->header) && (entry->start < end)) { - - UVM_MAP_CLIP_END(map, entry, end); - next = entry->next; - len = entry->end - entry->start; - if (p && entry->object.uvm_obj == NULL) - p->p_vmspace->vm_dused -= atop(len); + if (UVM_ET_ISHOLE(entry)) { + /* + * Nothing to be done for holes. + */ + } else if (map->flags & VM_MAP_INTRSAFE) { + KASSERT(vm_map_pmap(map) == pmap_kernel()); + uvm_km_pgremove_intrsafe(entry->start, entry->end); + pmap_kremove(entry->start, entry->end - entry->start); + } else if (UVM_ET_ISOBJ(entry) && + UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)) { + KASSERT(vm_map_pmap(map) == pmap_kernel()); /* - * unwire before removing addresses from the pmap; otherwise - * unwiring will put the entries back into the pmap (XXX). + * Note: kernel object mappings are currently used in + * two ways: + * [1] "normal" mappings of pages in the kernel object + * [2] uvm_km_valloc'd allocations in which we + * pmap_enter in some non-kernel-object page + * (e.g. vmapbuf). + * + * for case [1], we need to remove the mapping from + * the pmap and then remove the page from the kernel + * object (because, once pages in a kernel object are + * unmapped they are no longer needed, unlike, say, + * a vnode where you might want the data to persist + * until flushed out of a queue). + * + * for case [2], we need to remove the mapping from + * the pmap. there shouldn't be any pages at the + * specified offset in the kernel object [but it + * doesn't hurt to call uvm_km_pgremove just to be + * safe?] 
+ * + * uvm_km_pgremove currently does the following: + * for pages in the kernel object range: + * - drops the swap slot + * - uvm_pagefree the page + * + * note there is version of uvm_km_pgremove() that + * is used for "intrsafe" objects. */ - if (VM_MAPENT_ISWIRED(entry)) - uvm_map_entry_unwire(map, entry); + /* + * remove mappings from pmap and drop the pages + * from the object. offsets are always relative + * to vm_map_min(kernel_map). + */ + pmap_remove(pmap_kernel(), entry->start, entry->end); + uvm_km_pgremove(entry->object.uvm_obj, + entry->start - vm_map_min(kernel_map), + entry->end - vm_map_min(kernel_map)); /* - * special case: handle mappings to anonymous kernel objects. - * we want to free these pages right away... + * null out kernel_object reference, we've just + * dropped it */ -#ifdef KVA_GUARDPAGES - if (map == kernel_map && entry->etype & MAP_ET_KVAGUARD) { - entry->etype &= ~MAP_ET_KVAGUARD; - kva_guardpages--; - } else /* (code continues across line-break) */ -#endif - if (UVM_ET_ISHOLE(entry)) { - if (!remove_holes) { - entry = next; - continue; - } - } else if (map->flags & VM_MAP_INTRSAFE) { - uvm_km_pgremove_intrsafe(entry->start, entry->end); - pmap_kremove(entry->start, len); - } else if (UVM_ET_ISOBJ(entry) && - UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)) { - KASSERT(vm_map_pmap(map) == pmap_kernel()); + entry->etype &= ~UVM_ET_OBJ; + entry->object.uvm_obj = NULL; /* to be safe */ + } else { + /* + * remove mappings the standard way. + */ + pmap_remove(map->pmap, entry->start, entry->end); + } +} - /* - * note: kernel object mappings are currently used in - * two ways: - * [1] "normal" mappings of pages in the kernel object - * [2] uvm_km_valloc'd allocations in which we - * pmap_enter in some non-kernel-object page - * (e.g. vmapbuf). - * - * for case [1], we need to remove the mapping from - * the pmap and then remove the page from the kernel - * object (because, once pages in a kernel object are - * unmapped they are no longer needed, unlike, say, - * a vnode where you might want the data to persist - * until flushed out of a queue). - * - * for case [2], we need to remove the mapping from - * the pmap. there shouldn't be any pages at the - * specified offset in the kernel object [but it - * doesn't hurt to call uvm_km_pgremove just to be - * safe?] - * - * uvm_km_pgremove currently does the following: - * for pages in the kernel object in range: - * - drops the swap slot - * - uvm_pagefree the page - * - * note there is version of uvm_km_pgremove() that - * is used for "intrsafe" objects. - */ - - /* - * remove mappings from pmap and drop the pages - * from the object. offsets are always relative - * to vm_map_min(kernel_map). - */ - pmap_remove(pmap_kernel(), entry->start, entry->end); - uvm_km_pgremove(entry->object.uvm_obj, - entry->start - vm_map_min(kernel_map), - entry->end - vm_map_min(kernel_map)); - - /* - * null out kernel_object reference, we've just - * dropped it - */ - entry->etype &= ~UVM_ET_OBJ; - entry->object.uvm_obj = NULL; /* to be safe */ +/* + * Remove all entries from start to end. + * + * If remove_holes, then remove ET_HOLE entries as well. + * If markfree, entry will be properly marked free, otherwise, no replacement + * entry will be put in the tree (corrupting the tree). 
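/*
 * Example call (as used by uvm_unmap() above): a regular unmap wants the
 * free-space tree kept consistent, so markfree is TRUE and holes are
 * preserved; markfree == FALSE is presumably only safe for maps that are
 * about to be torn down and never searched again.
 */
	uvm_unmap_remove(map, start, end, &dead, FALSE, TRUE);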
+ */ +void +uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end, + struct uvm_map_deadq *dead, boolean_t remove_holes, + boolean_t markfree) +{ + struct vm_map_entry *prev_hint, *next, *entry; - } else { - /* - * remove mappings the standard way. - */ - pmap_remove(map->pmap, entry->start, entry->end); - } + start = MAX(start, map->min_offset); + end = MIN(end, map->max_offset); + if (start >= end) + return; - /* - * remove entry from map and put it on our list of entries - * that we've nuked. then go do next entry. - */ - /* critical! prevents stale hint */ - SAVE_HINT(map, entry, entry->prev); + if ((map->flags & VM_MAP_INTRSAFE) == 0) + splassert(IPL_NONE); + else + splassert(IPL_VM); - uvm_map_entry_unlink(map, entry); - map->size -= len; - entry->next = first_entry; - first_entry = entry; - entry = next; /* next entry, please */ - } -#ifdef KVA_GUARDPAGES /* - * entry points at the map-entry after the last-removed map-entry. + * Find first affected entry. */ - if (map == kernel_map && entry != &map->header && - entry->etype & MAP_ET_KVAGUARD && entry->start == end) { - /* - * Removed range is followed by guard page; - * remove that guard page now (or it will stay forever). - */ - entry->etype &= ~MAP_ET_KVAGUARD; - kva_guardpages--; - - uvm_map_entry_unlink(map, entry); - map->size -= len; - entry->next = first_entry; - first_entry = entry; - entry = next; /* next entry, please */ - } -#endif - /* if ((map->flags & VM_MAP_DYING) == 0) { */ - pmap_update(vm_map_pmap(map)); - /* } */ - - - uvm_tree_sanity(map, "unmap_remove leave"); + entry = uvm_map_entrybyaddr(&map->addr, start); + KDASSERT(entry != NULL && entry->start <= start); + if (entry->end <= start && markfree) + entry = RB_NEXT(uvm_map_addr, &map->addr, entry); + else + UVM_MAP_CLIP_START(map, entry, start); /* - * now we've cleaned up the map and are ready for the caller to drop - * references to the mapped objects. + * Iterate entries until we reach end address. + * prev_hint hints where the freed space can be appended to. */ + prev_hint = NULL; + for (; entry != NULL && entry->start < end; entry = next) { + KDASSERT(entry->start >= start); + if (entry->end > end || !markfree) + UVM_MAP_CLIP_END(map, entry, end); + KDASSERT(entry->start >= start && entry->end <= end); + next = RB_NEXT(uvm_map_addr, &map->addr, entry); - *entry_list = first_entry; -} - -/* - * uvm_unmap_detach: drop references in a chain of map entries - * - * => we will free the map entries as we traverse the list. - */ - -void -uvm_unmap_detach(struct vm_map_entry *first_entry, int flags) -{ - struct vm_map_entry *next_entry; + /* Don't remove holes unless asked to do so. */ + if (UVM_ET_ISHOLE(entry)) { + if (!remove_holes) { + prev_hint = entry; + continue; + } + } - while (first_entry) { - KASSERT(!VM_MAPENT_ISWIRED(first_entry)); + /* Kill entry. */ + uvm_unmap_kill_entry(map, entry); /* - * drop reference to amap, if we've got one + * Update space usage. */ - - if (first_entry->aref.ar_amap) - uvm_map_unreference_amap(first_entry, flags); + if ((map->flags & VM_MAP_ISVMSPACE) && + entry->object.uvm_obj == NULL && + !UVM_ET_ISHOLE(entry)) { + ((struct vmspace *)map)->vm_dused -= + uvmspace_dused(map, entry->start, entry->end); + } + if (!UVM_ET_ISHOLE(entry)) + map->size -= entry->end - entry->start; /* - * drop reference to our backing object, if we've got one + * Actual removal of entry. */ + uvm_mapent_mkfree(map, entry, &prev_hint, dead, markfree); + } - if (UVM_ET_ISSUBMAP(first_entry)) { - /* ... 
unlikely to happen, but play it safe */ - uvm_map_deallocate(first_entry->object.sub_map); - } else { - if (UVM_ET_ISOBJ(first_entry) && - first_entry->object.uvm_obj->pgops->pgo_detach) - first_entry->object.uvm_obj->pgops-> - pgo_detach(first_entry->object.uvm_obj); - } + pmap_update(vm_map_pmap(map)); - next_entry = first_entry->next; - uvm_mapent_free(first_entry); - first_entry = next_entry; +#ifdef VMMAP_DEBUG + if (markfree) { + for (entry = uvm_map_entrybyaddr(&map->addr, start); + entry != NULL && entry->start < end; + entry = RB_NEXT(uvm_map_addr, &map->addr, entry)) { + KDASSERT(entry->end <= start || + entry->start == entry->end || + UVM_ET_ISHOLE(entry)); + } + } else { + vaddr_t a; + for (a = start; a < end; a += PAGE_SIZE) + KDASSERT(uvm_map_entrybyaddr(&map->addr, a) == NULL); } +#endif } /* - * E X T R A C T I O N F U N C T I O N S - */ - -/* - * uvm_map_reserve: reserve space in a vm_map for future use. + * Mark all entries from first until end (exclusive) as pageable. * - * => we reserve space in a map by putting a dummy map entry in the - * map (dummy means obj=NULL, amap=NULL, prot=VM_PROT_NONE) - * => map should be unlocked (we will write lock it) - * => we return true if we were able to reserve space - * => XXXCDC: should be inline? + * Lock must be exclusive on entry and will not be touched. */ - -int -uvm_map_reserve(struct vm_map *map, vsize_t size, vaddr_t offset, - vsize_t align, vaddr_t *raddr) +void +uvm_map_pageable_pgon(struct vm_map *map, struct vm_map_entry *first, + struct vm_map_entry *end, vaddr_t start_addr, vaddr_t end_addr) { + struct vm_map_entry *iter; - size = round_page(size); - if (*raddr < vm_map_min(map)) - *raddr = vm_map_min(map); /* hint */ - - /* - * reserve some virtual space. - */ - - if (uvm_map(map, raddr, size, NULL, offset, 0, - UVM_MAPFLAG(UVM_PROT_NONE, UVM_PROT_NONE, UVM_INH_NONE, - UVM_ADV_RANDOM, UVM_FLAG_NOMERGE)) != 0) { - return (FALSE); - } + for (iter = first; iter != end; + iter = RB_NEXT(uvm_map_addr, &map->addr, iter)) { + KDASSERT(iter->start >= start_addr && iter->end <= end_addr); + if (!VM_MAPENT_ISWIRED(iter) || UVM_ET_ISHOLE(iter)) + continue; - return (TRUE); + iter->wired_count = 0; + uvm_fault_unwire_locked(map, iter->start, iter->end); + } } /* - * uvm_map_replace: replace a reserved (blank) area of memory with - * real mappings. + * Mark all entries from first until end (exclusive) as wired. * - * => caller must WRITE-LOCK the map - * => we return TRUE if replacement was a success - * => we expect the newents chain to have nnewents entries on it and - * we expect newents->prev to point to the last entry on the list - * => note newents is allowed to be NULL + * Lockflags determines the lock state on return from this function. + * Lock must be exclusive on entry. */ - int -uvm_map_replace(struct vm_map *map, vaddr_t start, vaddr_t end, - struct vm_map_entry *newents, int nnewents) +uvm_map_pageable_wire(struct vm_map *map, struct vm_map_entry *first, + struct vm_map_entry *end, vaddr_t start_addr, vaddr_t end_addr, + int lockflags) { - struct vm_map_entry *oldent, *last; - - uvm_tree_sanity(map, "map_replace entry"); + struct vm_map_entry *iter; +#ifdef DIAGNOSTIC + unsigned int timestamp_save; +#endif + int error; /* - * first find the blank map entry at the specified address + * Wire pages in two passes: + * + * 1: holding the write lock, we create any anonymous maps that need + * to be created. then we clip each map entry to the region to + * be wired and increment its wiring count. 
+ * + * 2: we downgrade to a read lock, and call uvm_fault_wire to fault + * in the pages for any newly wired area (wired_count == 1). + * + * downgrading to a read lock for uvm_fault_wire avoids a possible + * deadlock with another thread that may have faulted on one of + * the pages to be wired (it would mark the page busy, blocking + * us, then in turn block on the map lock that we hold). + * because we keep the read lock on the map, the copy-on-write + * status of the entries we modify here cannot change. */ + for (iter = first; iter != end; + iter = RB_NEXT(uvm_map_addr, &map->addr, iter)) { + KDASSERT(iter->start >= start_addr && iter->end <= end_addr); + if (UVM_ET_ISHOLE(iter) || iter->start == iter->end) + continue; - if (!uvm_map_lookup_entry(map, start, &oldent)) { - return(FALSE); + /* + * Perform actions of vm_map_lookup that need the write lock. + * - create an anonymous map for copy-on-write + * - anonymous map for zero-fill + * Skip submaps. + */ + if (!VM_MAPENT_ISWIRED(iter) && !UVM_ET_ISSUBMAP(iter) && + UVM_ET_ISNEEDSCOPY(iter) && + ((iter->protection & VM_PROT_WRITE) || + iter->object.uvm_obj == NULL)) { + amap_copy(map, iter, M_WAITOK, TRUE, + iter->start, iter->end); + } + iter->wired_count++; } /* - * check to make sure we have a proper blank entry + * Pass 2. */ - - if (oldent->start != start || oldent->end != end || - oldent->object.uvm_obj != NULL || oldent->aref.ar_amap != NULL) { - return (FALSE); - } - #ifdef DIAGNOSTIC - /* - * sanity check the newents chain - */ - { - struct vm_map_entry *tmpent = newents; - int nent = 0; - vaddr_t cur = start; - - while (tmpent) { - nent++; - if (tmpent->start < cur) - panic("uvm_map_replace1"); - if (tmpent->start > tmpent->end || tmpent->end > end) { - printf("tmpent->start=0x%lx, tmpent->end=0x%lx, end=0x%lx\n", - tmpent->start, tmpent->end, end); - panic("uvm_map_replace2"); - } - cur = tmpent->end; - if (tmpent->next) { - if (tmpent->next->prev != tmpent) - panic("uvm_map_replace3"); - } else { - if (newents->prev != tmpent) - panic("uvm_map_replace4"); - } - tmpent = tmpent->next; - } - if (nent != nnewents) - panic("uvm_map_replace5"); - } + timestamp_save = map->timestamp; #endif + vm_map_busy(map); + vm_map_downgrade(map); - /* - * map entry is a valid blank! replace it. (this does all the - * work of map entry link/unlink...). - */ - - if (newents) { - last = newents->prev; /* we expect this */ - - /* critical: flush stale hints out of map */ - SAVE_HINT(map, map->hint, newents); - if (map->first_free == oldent) - map->first_free = last; - - last->next = oldent->next; - last->next->prev = last; + error = 0; + for (iter = first; error == 0 && iter != end; + iter = RB_NEXT(uvm_map_addr, &map->addr, iter)) { + if (UVM_ET_ISHOLE(iter) || iter->start == iter->end) + continue; - /* Fix RB tree */ - uvm_rb_remove(map, oldent); + error = uvm_fault_wire(map, iter->start, iter->end, + iter->protection); + } - newents->prev = oldent->prev; - newents->prev->next = newents; - map->nentries = map->nentries + (nnewents - 1); + if (error) { + /* + * uvm_fault_wire failure + * + * Reacquire lock and undo our work. + */ + vm_map_upgrade(map); + vm_map_unbusy(map); +#ifdef DIAGNOSTIC + if (timestamp_save != map->timestamp) + panic("uvm_map_pageable_wire: stale map"); +#endif - /* Fixup the RB tree */ - { - int i; - struct vm_map_entry *tmp; + /* + * first is no longer needed to restart loops. + * Use it as iterator to unmap successful mappings. 
+ */ + for (; first != iter; + first = RB_NEXT(uvm_map_addr, &map->addr, first)) { + if (UVM_ET_ISHOLE(first) || first->start == first->end) + continue; - tmp = newents; - for (i = 0; i < nnewents && tmp; i++) { - uvm_rb_insert(map, tmp); - tmp = tmp->next; + first->wired_count--; + if (!VM_MAPENT_ISWIRED(first)) { + uvm_fault_unwire_locked(map, + iter->start, iter->end); } } - } else { - - /* critical: flush stale hints out of map */ - SAVE_HINT(map, map->hint, oldent->prev); - if (map->first_free == oldent) - map->first_free = oldent->prev; - /* NULL list of new entries: just remove the old one */ - uvm_map_entry_unlink(map, oldent); - } + /* + * decrease counter in the rest of the entries + */ + for (; iter != end; + iter = RB_NEXT(uvm_map_addr, &map->addr, iter)) { + if (UVM_ET_ISHOLE(iter) || iter->start == iter->end) + continue; + iter->wired_count--; + } - uvm_tree_sanity(map, "map_replace leave"); + if ((lockflags & UVM_LK_EXIT) == 0) + vm_map_unlock(map); + return error; + } /* - * now we can free the old blank entry, unlock the map and return. + * We are currently holding a read lock. */ - - uvm_mapent_free(oldent); - return(TRUE); + if ((lockflags & UVM_LK_EXIT) == 0) { + vm_map_unbusy(map); + vm_map_unlock_read(map); + } else { + vm_map_upgrade(map); + vm_map_unbusy(map); +#ifdef DIAGNOSTIC + if (timestamp_save != map->timestamp) + panic("uvm_map_pageable_wire: stale map"); +#endif + } + return 0; } /* - * uvm_map_extract: extract a mapping from a map and put it somewhere - * (maybe removing the old mapping) + * uvm_map_pageable: set pageability of a range in a map. * - * => maps should be unlocked (we will write lock them) - * => returns 0 on success, error code otherwise - * => start must be page aligned - * => len must be page sized - * => flags: - * UVM_EXTRACT_REMOVE: remove mappings from srcmap - * UVM_EXTRACT_CONTIG: abort if unmapped area (advisory only) - * UVM_EXTRACT_QREF: for a temporary extraction do quick obj refs - * UVM_EXTRACT_FIXPROT: set prot to maxprot as we go - * >>>NOTE: if you set REMOVE, you are not allowed to use CONTIG or QREF!<<< - * >>>NOTE: QREF's must be unmapped via the QREF path, thus should only - * be used from within the kernel in a kernel level map <<< + * Flags: + * UVM_LK_ENTER: map is already locked by caller + * UVM_LK_EXIT: don't unlock map on exit + * + * The full range must be in use (entries may not have fspace != 0). + * UVM_ET_HOLE counts as unmapped. */ - int -uvm_map_extract(struct vm_map *srcmap, vaddr_t start, vsize_t len, - struct vm_map *dstmap, vaddr_t *dstaddrp, int flags) +uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end, + boolean_t new_pageable, int lockflags) { - vaddr_t dstaddr, end, newend, oldoffset, fudge, orig_fudge, - oldstart; - struct vm_map_entry *chain, *endchain, *entry, *orig_entry, *newentry; - struct vm_map_entry *deadentry, *oldentry; - vsize_t elen; - int nchain, error, copy_ok; + struct vm_map_entry *first, *last, *tmp; + int error; - uvm_tree_sanity(srcmap, "map_extract src enter"); - uvm_tree_sanity(dstmap, "map_extract dst enter"); + start = trunc_page(start); + end = round_page(end); - /* - * step 0: sanity check: start must be on a page boundary, length - * must be page sized. can't ask for CONTIG/QREF if you asked for - * REMOVE. - */ + if (start > end) + return EINVAL; + if (start < map->min_offset) + return EFAULT; /* why? see first XXX below */ + if (end > map->max_offset) + return EINVAL; /* why? 
see second XXX below */ - KASSERT((start & PAGE_MASK) == 0 && (len & PAGE_MASK) == 0); - KASSERT((flags & UVM_EXTRACT_REMOVE) == 0 || - (flags & (UVM_EXTRACT_CONTIG|UVM_EXTRACT_QREF)) == 0); + KASSERT(map->flags & VM_MAP_PAGEABLE); + if ((lockflags & UVM_LK_ENTER) == 0) + vm_map_lock(map); /* - * step 1: reserve space in the target map for the extracted area + * Find first entry. + * + * Initial test on start is different, because of the different + * error returned. Rest is tested further down. */ - - dstaddr = vm_map_min(dstmap); - if (uvm_map_reserve(dstmap, len, start, 0, &dstaddr) == FALSE) - return(ENOMEM); - *dstaddrp = dstaddr; /* pass address back to caller */ + first = uvm_map_entrybyaddr(&map->addr, start); + if (first->end <= start || UVM_ET_ISHOLE(first)) { + /* + * XXX if the first address is not mapped, it is EFAULT? + */ + error = EFAULT; + goto out; + } /* - * step 2: setup for the extraction process loop by init'ing the - * map entry chain, locking src map, and looking up the first useful - * entry in the map. + * Check that the range has no holes. */ - - end = start + len; - newend = dstaddr + len; - chain = endchain = NULL; - nchain = 0; - vm_map_lock(srcmap); - - if (uvm_map_lookup_entry(srcmap, start, &entry)) { - - /* "start" is within an entry */ - if (flags & UVM_EXTRACT_QREF) { - - /* - * for quick references we don't clip the entry, so - * the entry may map space "before" the starting - * virtual address... this is the "fudge" factor - * (which can be non-zero only the first time - * through the "while" loop in step 3). - */ - - fudge = start - entry->start; - } else { - + for (last = first; last != NULL && last->start < end; + last = RB_NEXT(uvm_map_addr, &map->addr, last)) { + if (UVM_ET_ISHOLE(last) || + (last->end < end && VMMAP_FREE_END(last) != last->end)) { /* - * normal reference: we clip the map to fit (thus - * fudge is zero) + * XXX unmapped memory in range, why is it EINVAL + * instead of EFAULT? */ - - UVM_MAP_CLIP_START(srcmap, entry, start); - SAVE_HINT(srcmap, srcmap->hint, entry->prev); - fudge = 0; - } - } else { - - /* "start" is not within an entry ... skip to next entry */ - if (flags & UVM_EXTRACT_CONTIG) { error = EINVAL; - goto bad; /* definite hole here ... */ + goto out; } - - entry = entry->next; - fudge = 0; } - /* save values from srcmap for step 6 */ - orig_entry = entry; - orig_fudge = fudge; - /* - * step 3: now start looping through the map entries, extracting - * as we go. + * Last ended at the first entry after the range. + * Move back one step. + * + * Note that last may be NULL. */ + if (last == NULL) { + last = RB_MAX(uvm_map_addr, &map->addr); + if (last->end < end) { + error = EINVAL; + goto out; + } + } else + last = RB_PREV(uvm_map_addr, &map->addr, last); - while (entry->start < end && entry != &srcmap->header) { - - /* if we are not doing a quick reference, clip it */ - if ((flags & UVM_EXTRACT_QREF) == 0) - UVM_MAP_CLIP_END(srcmap, entry, end); + /* + * Wire/unwire pages here. + */ + if (new_pageable) { + /* + * Mark pageable. + * entries that are not wired are untouched. + */ + if (VM_MAPENT_ISWIRED(first)) + UVM_MAP_CLIP_START(map, first, start); + /* + * Split last at end. + * Make tmp be the first entry after what is to be touched. + * If last is not wired, don't touch it. 
+ */ + if (VM_MAPENT_ISWIRED(last)) { + UVM_MAP_CLIP_END(map, last, end); + tmp = RB_NEXT(uvm_map_addr, &map->addr, last); + } else + tmp = last; - /* clear needs_copy (allow chunking) */ - if (UVM_ET_ISNEEDSCOPY(entry)) { - if (fudge) - oldstart = entry->start; - else - oldstart = 0; /* XXX: gcc */ - amap_copy(srcmap, entry, M_NOWAIT, TRUE, start, end); - if (UVM_ET_ISNEEDSCOPY(entry)) { /* failed? */ - error = ENOMEM; - goto bad; - } + uvm_map_pageable_pgon(map, first, tmp, start, end); + error = 0; - /* amap_copy could clip (during chunk)! update fudge */ - if (fudge) { - fudge = fudge - (entry->start - oldstart); - orig_fudge = fudge; - } - } +out: + if ((lockflags & UVM_LK_EXIT) == 0) + vm_map_unlock(map); + return error; + } else { + /* + * Mark entries wired. + * entries are always touched (because recovery needs this). + */ + if (!VM_MAPENT_ISWIRED(first)) + UVM_MAP_CLIP_START(map, first, start); + /* + * Split last at end. + * Make tmp be the first entry after what is to be touched. + * If last is not wired, don't touch it. + */ + if (!VM_MAPENT_ISWIRED(last)) { + UVM_MAP_CLIP_END(map, last, end); + tmp = RB_NEXT(uvm_map_addr, &map->addr, last); + } else + tmp = last; + + return uvm_map_pageable_wire(map, first, tmp, start, end, + lockflags); + } +} - /* calculate the offset of this from "start" */ - oldoffset = (entry->start + fudge) - start; +/* + * uvm_map_pageable_all: special case of uvm_map_pageable - affects + * all mapped regions. + * + * Map must not be locked. + * If no flags are specified, all ragions are unwired. + */ +int +uvm_map_pageable_all(struct vm_map *map, int flags, vsize_t limit) +{ + vsize_t size; + struct vm_map_entry *iter; - /* allocate a new map entry */ - newentry = uvm_mapent_alloc(dstmap, flags); - if (newentry == NULL) { - error = ENOMEM; - goto bad; - } + KASSERT(map->flags & VM_MAP_PAGEABLE); + vm_map_lock(map); - /* set up new map entry */ - newentry->next = NULL; - newentry->prev = endchain; - newentry->start = dstaddr + oldoffset; - newentry->end = - newentry->start + (entry->end - (entry->start + fudge)); - if (newentry->end > newend || newentry->end < newentry->start) - newentry->end = newend; - newentry->object.uvm_obj = entry->object.uvm_obj; - if (newentry->object.uvm_obj) { - if (newentry->object.uvm_obj->pgops->pgo_reference) - newentry->object.uvm_obj->pgops-> - pgo_reference(newentry->object.uvm_obj); - newentry->offset = entry->offset + fudge; - } else { - newentry->offset = 0; - } - newentry->etype = entry->etype; - newentry->protection = (flags & UVM_EXTRACT_FIXPROT) ? - entry->max_protection : entry->protection; - newentry->max_protection = entry->max_protection; - newentry->inheritance = entry->inheritance; - newentry->wired_count = 0; - newentry->aref.ar_amap = entry->aref.ar_amap; - if (newentry->aref.ar_amap) { - newentry->aref.ar_pageoff = - entry->aref.ar_pageoff + (fudge >> PAGE_SHIFT); - uvm_map_reference_amap(newentry, AMAP_SHARED | - ((flags & UVM_EXTRACT_QREF) ? AMAP_REFALL : 0)); - } else { - newentry->aref.ar_pageoff = 0; - } - newentry->advice = entry->advice; + if (flags == 0) { + uvm_map_pageable_pgon(map, RB_MIN(uvm_map_addr, &map->addr), + NULL, map->min_offset, map->max_offset); - /* now link it on the chain */ - nchain++; - if (endchain == NULL) { - chain = endchain = newentry; - } else { - endchain->next = newentry; - endchain = newentry; - } + atomic_clearbits_int(&map->flags, VM_MAP_WIREFUTURE); + vm_map_unlock(map); + return 0; + } - /* end of 'while' loop! 
*/ - if ((flags & UVM_EXTRACT_CONTIG) && entry->end < end && - (entry->next == &srcmap->header || - entry->next->start != entry->end)) { - error = EINVAL; - goto bad; - } - entry = entry->next; - fudge = 0; + if (flags & MCL_FUTURE) + atomic_setbits_int(&map->flags, VM_MAP_WIREFUTURE); + if (!(flags & MCL_CURRENT)) { + vm_map_unlock(map); + return 0; } /* - * step 4: close off chain (in format expected by uvm_map_replace) + * Count number of pages in all non-wired entries. + * If the number exceeds the limit, abort. */ + size = 0; + RB_FOREACH(iter, uvm_map_addr, &map->addr) { + if (VM_MAPENT_ISWIRED(iter) || UVM_ET_ISHOLE(iter)) + continue; - if (chain) - chain->prev = endchain; + size += iter->end - iter->start; + } - /* - * step 5: attempt to lock the dest map so we can pmap_copy. - * note usage of copy_ok: - * 1 => dstmap locked, pmap_copy ok, and we "replace" here (step 5) - * 0 => dstmap unlocked, NO pmap_copy, and we will "replace" in step 7 - */ + if (atop(size) + uvmexp.wired > uvmexp.wiredmax) { + vm_map_unlock(map); + return ENOMEM; + } - if (srcmap == dstmap || vm_map_lock_try(dstmap) == TRUE) { - copy_ok = 1; - if (!uvm_map_replace(dstmap, dstaddr, dstaddr+len, chain, - nchain)) { - if (srcmap != dstmap) - vm_map_unlock(dstmap); - error = EIO; - goto bad; - } - } else { - copy_ok = 0; - /* replace defered until step 7 */ + /* XXX non-pmap_wired_count case must be handled by caller */ +#ifdef pmap_wired_count + if (limit != 0 && + size + ptoa(pmap_wired_count(vm_map_pmap(map))) > limit) { + vm_map_unlock(map); + return ENOMEM; } +#endif /* - * step 6: traverse the srcmap a second time to do the following: - * - if we got a lock on the dstmap do pmap_copy - * - if UVM_EXTRACT_REMOVE remove the entries - * we make use of orig_entry and orig_fudge (saved in step 2) + * uvm_map_pageable_wire will release lcok */ + return uvm_map_pageable_wire(map, RB_MIN(uvm_map_addr, &map->addr), + NULL, map->min_offset, map->max_offset, 0); +} - if (copy_ok || (flags & UVM_EXTRACT_REMOVE)) { - - /* purge possible stale hints from srcmap */ - if (flags & UVM_EXTRACT_REMOVE) { - SAVE_HINT(srcmap, srcmap->hint, orig_entry->prev); - if (srcmap->first_free->start >= start) - srcmap->first_free = orig_entry->prev; - } - - entry = orig_entry; - fudge = orig_fudge; - deadentry = NULL; /* for UVM_EXTRACT_REMOVE */ - - while (entry->start < end && entry != &srcmap->header) { - if (copy_ok) { - oldoffset = (entry->start + fudge) - start; - elen = MIN(end, entry->end) - - (entry->start + fudge); - pmap_copy(dstmap->pmap, srcmap->pmap, - dstaddr + oldoffset, elen, - entry->start + fudge); - } - - /* we advance "entry" in the following if statement */ - if (flags & UVM_EXTRACT_REMOVE) { - pmap_remove(srcmap->pmap, entry->start, - entry->end); - oldentry = entry; /* save entry */ - entry = entry->next; /* advance */ - uvm_map_entry_unlink(srcmap, oldentry); - /* add to dead list */ - oldentry->next = deadentry; - deadentry = oldentry; - } else { - entry = entry->next; /* advance */ - } - - /* end of 'while' loop */ - fudge = 0; - } - pmap_update(srcmap->pmap); - - /* - * unlock dstmap. we will dispose of deadentry in - * step 7 if needed - */ - - if (copy_ok && srcmap != dstmap) - vm_map_unlock(dstmap); +/* + * Initialize map. + * + * Allocates sufficient entries to describe the free memory in the map. 
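/*
 * Sketch of how the wiring entry points above are reached (assuming the
 * uvm_mmap.c callers this diff also touches; p, start, size and lockmax
 * stand in for the caller's values):
 */
	/* mlock(2): wire a single range of the current vmspace */
	error = uvm_map_pageable(&p->p_vmspace->vm_map,
	    start, start + size, FALSE, 0);

	/* mlockall(2) with MCL_CURRENT|MCL_FUTURE: wire everything */
	error = uvm_map_pageable_all(&p->p_vmspace->vm_map,
	    MCL_CURRENT | MCL_FUTURE, lockmax);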
+ */ +void +uvm_map_setup(struct vm_map *map, vaddr_t min, vaddr_t max, int flags) +{ + int i; - } - else - deadentry = NULL; /* XXX: gcc */ + KASSERT((min & (vaddr_t)PAGE_MASK) == 0); + KASSERT((max & (vaddr_t)PAGE_MASK) == 0 || + (max & (vaddr_t)PAGE_MASK) == (vaddr_t)PAGE_MASK); /* - * step 7: we are done with the source map, unlock. if copy_ok - * is 0 then we have not replaced the dummy mapping in dstmap yet - * and we need to do so now. + * Update parameters. + * + * This code handles (vaddr_t)-1 and other page mask ending addresses + * properly. + * We lose the top page if the full virtual address space is used. */ - - vm_map_unlock(srcmap); - if ((flags & UVM_EXTRACT_REMOVE) && deadentry) - uvm_unmap_detach(deadentry, 0); /* dispose of old entries */ - - /* now do the replacement if we didn't do it in step 5 */ - if (copy_ok == 0) { - vm_map_lock(dstmap); - error = uvm_map_replace(dstmap, dstaddr, dstaddr+len, chain, - nchain); - vm_map_unlock(dstmap); - - if (error == FALSE) { - error = EIO; - goto bad2; - } + if (max & (vaddr_t)PAGE_MASK) { + max += 1; + if (max == 0) /* overflow */ + max -= PAGE_SIZE; } - uvm_tree_sanity(srcmap, "map_extract src leave"); - uvm_tree_sanity(dstmap, "map_extract dst leave"); + RB_INIT(&map->addr); + map->uaddr_exe = NULL; + for (i = 0; i < nitems(map->uaddr_any); ++i) + map->uaddr_any[i] = NULL; + map->uaddr_brk_stack = NULL; + + map->size = 0; + map->ref_count = 1; + map->min_offset = min; + map->max_offset = max; + map->b_start = map->b_end = 0; /* Empty brk() area by default. */ + map->s_start = map->s_end = 0; /* Empty stack area by default. */ + map->flags = flags; + map->timestamp = 0; + rw_init(&map->lock, "vmmaplk"); + simple_lock_init(&map->ref_lock); - return(0); + /* + * Ensure the selectors will not try to manage page 0; + * it's too special. + */ + if (min < VMMAP_MIN_ADDR) + min = VMMAP_MIN_ADDR; /* - * bad: failure recovery + * Configure the allocators. */ -bad: - vm_map_unlock(srcmap); -bad2: /* src already unlocked */ - if (chain) - uvm_unmap_detach(chain, - (flags & UVM_EXTRACT_QREF) ? AMAP_REFALL : 0); + if (flags & VM_MAP_ISVMSPACE) { + /* + * Setup hint areas. + */ +#if 0 /* Don't use the cool stuff yet. */ +#ifdef __LP64__ + /* Hinted allocations above 4GB */ + map->uaddr_any[0] = + uaddr_hint_create(0x100000000ULL, max, 1024 * 1024 * 1024); + /* Hinted allocations below 4GB */ + map->uaddr_any[1] = + uaddr_hint_create(MAX(min, VMMAP_MIN_ADDR), 0x100000000ULL, + 1024 * 1024 * 1024); +#else + map->uaddr_any[1] = + uaddr_hint_create(MAX(min, VMMAP_MIN_ADDR), max, + 1024 * 1024 * 1024); +#endif - uvm_tree_sanity(srcmap, "map_extract src err leave"); - uvm_tree_sanity(dstmap, "map_extract dst err leave"); +#ifdef __i386__ + map->uaddr_exe = uaddr_rnd_create(min, I386_MAX_EXE_ADDR); + map->uaddr_any[3] = uaddr_pivot_create(2 * I386_MAX_EXE_ADDR, + max); +#elif defined(__LP64__) + map->uaddr_any[3] = + uaddr_pivot_create(MAX(min, 0x100000000ULL), max); +#else + map->uaddr_any[3] = uaddr_pivot_create(min, max); +#endif +#else /* Don't use the cool stuff yet. */ + /* + * Use the really crappy stuff at first commit. + * Browsers like crappy stuff. + */ + map->uaddr_any[0] = uaddr_rnd_create(min, max); +#endif + map->uaddr_brk_stack = uaddr_stack_brk_create(min, max); + } else + map->uaddr_any[3] = &uaddr_kbootstrap; - uvm_unmap(dstmap, dstaddr, dstaddr+len); /* ??? */ - return(error); + /* + * Fill map entries. + * This requires a write-locked map (because of diagnostic assertions + * in insert code). 
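
[The boundary fix-up in uvm_map_setup() accepts a 'max' that either is page aligned or ends on a page mask (e.g. 0xffffffff), bumps the latter to the first byte past the region, and backs off one page if that bump wraps to zero. A standalone sketch of the same normalization, using 32-bit values so the overflow case is easy to see; PAGE_SIZE/PAGE_MASK are local 4 KB definitions, not taken from the kernel headers.]

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	0x1000u
#define PAGE_MASK	(PAGE_SIZE - 1)

/* Normalize an inclusive-looking end address to an exclusive one. */
uint32_t
normalize_max(uint32_t max)
{
	if (max & PAGE_MASK) {
		max += 1;		/* 0xffffffff wraps to 0x00000000 */
		if (max == 0)		/* wrapped: give up the top page */
			max -= PAGE_SIZE;
	}
	return max;
}

int
main(void)
{
	printf("%#x\n", (unsigned)normalize_max(0xbfffffffu)); /* 0xc0000000 */
	printf("%#x\n", (unsigned)normalize_max(0xffffffffu)); /* 0xfffff000 */
	return 0;
}

[Keeping 'max' exclusive this way avoids needing a wider integer type; the cost is the one unusable page at the very top of a full address space, as the comment above notes.]
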
+ */ + if ((map->flags & VM_MAP_INTRSAFE) == 0) { + if (rw_enter(&map->lock, RW_NOSLEEP|RW_WRITE) != 0) + panic("uvm_map_setup: rw_enter failed on new map"); + } + uvm_map_setup_entries(map); + uvm_tree_sanity(map, __FILE__, __LINE__); + if ((map->flags & VM_MAP_INTRSAFE) == 0) + rw_exit(&map->lock); } -/* end of extraction functions */ - /* - * uvm_map_submap: punch down part of a map into a submap + * Destroy the map. * - * => only the kernel_map is allowed to be submapped - * => the purpose of submapping is to break up the locking granularity - * of a larger map - * => the range specified must have been mapped previously with a uvm_map() - * call [with uobj==NULL] to create a blank map entry in the main map. - * [And it had better still be blank!] - * => maps which contain submaps should never be copied or forked. - * => to remove a submap, use uvm_unmap() on the main map - * and then uvm_map_deallocate() the submap. - * => main map must be unlocked. - * => submap must have been init'd and have a zero reference count. - * [need not be locked as we don't actually reference it] + * This is the inverse operation to uvm_map_setup. */ - -int -uvm_map_submap(struct vm_map *map, vaddr_t start, vaddr_t end, - struct vm_map *submap) +void +uvm_map_teardown(struct vm_map *map) { - struct vm_map_entry *entry; - int result; - - vm_map_lock(map); + struct uvm_map_deadq dead_entries; + int i; + struct vm_map_entry *entry, *tmp; +#ifdef VMMAP_DEBUG + size_t numq, numt; +#endif - VM_MAP_RANGE_CHECK(map, start, end); + if ((map->flags & VM_MAP_INTRSAFE) == 0) { + if (rw_enter(&map->lock, RW_NOSLEEP | RW_WRITE) != 0) + panic("uvm_map_teardown: rw_enter failed on free map"); + } - if (uvm_map_lookup_entry(map, start, &entry)) { - UVM_MAP_CLIP_START(map, entry, start); - UVM_MAP_CLIP_END(map, entry, end); /* to be safe */ - } else { - entry = NULL; + /* + * Remove address selectors. + */ + uvm_addr_destroy(map->uaddr_exe); + map->uaddr_exe = NULL; + for (i = 0; i < nitems(map->uaddr_any); i++) { + uvm_addr_destroy(map->uaddr_any[i]); + map->uaddr_any[i] = NULL; } + uvm_addr_destroy(map->uaddr_brk_stack); + map->uaddr_brk_stack = NULL; - if (entry != NULL && - entry->start == start && entry->end == end && - entry->object.uvm_obj == NULL && entry->aref.ar_amap == NULL && - !UVM_ET_ISCOPYONWRITE(entry) && !UVM_ET_ISNEEDSCOPY(entry)) { - entry->etype |= UVM_ET_SUBMAP; - entry->object.sub_map = submap; - entry->offset = 0; - uvm_map_reference(submap); - result = 0; - } else { - result = EINVAL; + /* + * Remove entries. + * + * The following is based on graph breadth-first search. + * + * In color terms: + * - the dead_entries set contains all nodes that are reachable + * (i.e. both the black and the grey nodes) + * - any entry not in dead_entries is white + * - any entry that appears in dead_entries before entry, + * is black, the rest is grey. + * The set [entry, end] is also referred to as the wavefront. + * + * Since the tree is always a fully connected graph, the breadth-first + * search guarantees that each vmmap_entry is visited exactly once. + * The vm_map is broken down in linear time. + */ + TAILQ_INIT(&dead_entries); + if ((entry = RB_ROOT(&map->addr)) != NULL) + DEAD_ENTRY_PUSH(&dead_entries, entry); + while (entry != NULL) { + uvm_unmap_kill_entry(map, entry); + if ((tmp = RB_LEFT(entry, daddrs.addr_entry)) != NULL) + DEAD_ENTRY_PUSH(&dead_entries, tmp); + if ((tmp = RB_RIGHT(entry, daddrs.addr_entry)) != NULL) + DEAD_ENTRY_PUSH(&dead_entries, tmp); + /* Update wave-front. 
*/ + entry = TAILQ_NEXT(entry, dfree.deadq); } - vm_map_unlock(map); - return(result); -} + if ((map->flags & VM_MAP_INTRSAFE) == 0) + rw_exit(&map->lock); + +#ifdef VMMAP_DEBUG + numt = numq = 0; + RB_FOREACH(entry, uvm_map_addr, &map->addr) + numt++; + TAILQ_FOREACH(entry, &dead_entries, dfree.deadq) + numq++; + KASSERT(numt == numq); +#endif + uvm_unmap_detach(&dead_entries, 0); + pmap_destroy(map->pmap); + map->pmap = NULL; +} /* - * uvm_map_protect: change map protection + * Populate map with free-memory entries. * - * => set_max means set max_protection. - * => map must be unlocked. + * Map must be initialized and empty. */ +void +uvm_map_setup_entries(struct vm_map *map) +{ + KDASSERT(RB_EMPTY(&map->addr)); -#define MASK(entry) (UVM_ET_ISCOPYONWRITE(entry) ? \ - ~VM_PROT_WRITE : VM_PROT_ALL) -#define max(a,b) ((a) > (b) ? (a) : (b)) + uvm_map_fix_space(map, NULL, map->min_offset, map->max_offset, 0); +} -int -uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end, - vm_prot_t new_prot, boolean_t set_max) +/* + * Split entry at given address. + * + * orig: entry that is to be split. + * next: a newly allocated map entry that is not linked. + * split: address at which the split is done. + */ +void +uvm_map_splitentry(struct vm_map *map, struct vm_map_entry *orig, + struct vm_map_entry *next, vaddr_t split) { - struct vm_map_entry *current, *entry; - int error = 0; + struct uvm_addr_state *free, *free_before; + vsize_t adj; - vm_map_lock(map); + if ((split & PAGE_MASK) != 0) { + panic("uvm_map_splitentry: split address 0x%lx " + "not on page boundary!", split); + } + KDASSERT(map != NULL && orig != NULL && next != NULL); + uvm_tree_sanity(map, __FILE__, __LINE__); + KASSERT(orig->start < split && VMMAP_FREE_END(orig) > split); - VM_MAP_RANGE_CHECK(map, start, end); +#ifdef VMMAP_DEBUG + KDASSERT(RB_FIND(uvm_map_addr, &map->addr, orig) == orig); + KDASSERT(RB_FIND(uvm_map_addr, &map->addr, next) != next); +#endif /* VMMAP_DEBUG */ - if (uvm_map_lookup_entry(map, start, &entry)) { - UVM_MAP_CLIP_START(map, entry, start); + /* + * Free space will change, unlink from free space tree. + */ + free = uvm_map_uaddr_e(map, orig); + if (free) + uvm_mapent_free_remove(map, free, orig); + + adj = split - orig->start; + + uvm_mapent_copy(orig, next); + if (split >= orig->end) { + next->etype = 0; + next->offset = 0; + next->wired_count = 0; + next->start = next->end = split; + next->guard = 0; + next->fspace = VMMAP_FREE_END(orig) - split; + next->aref.ar_amap = NULL; + next->aref.ar_pageoff = 0; + orig->guard = MIN(orig->guard, split - orig->end); + orig->fspace = split - VMMAP_FREE_START(orig); } else { - entry = entry->next; + orig->fspace = 0; + orig->guard = 0; + orig->end = next->start = split; + + if (next->aref.ar_amap) + amap_splitref(&orig->aref, &next->aref, adj); + if (UVM_ET_ISSUBMAP(orig)) { + uvm_map_reference(next->object.sub_map); + next->offset += adj; + } else if (UVM_ET_ISOBJ(orig)) { + if (next->object.uvm_obj->pgops && + next->object.uvm_obj->pgops->pgo_reference) { + next->object.uvm_obj->pgops->pgo_reference( + next->object.uvm_obj); + } + next->offset += adj; + } } /* - * make a first pass to check for protection violations. + * Link next into address tree. + * Link orig and next into free-space tree. + * + * Don't insert 'next' into the addr tree until orig has been linked, + * in case the free-list looks at adjecent entries in the addr tree + * for its decisions. 
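
[The teardown loop above walks the address tree breadth-first: the root is pushed onto the dead queue, and each visited node appends its children, so the queue itself is the wavefront and every entry is queued exactly once. A minimal sketch of the same idea on a plain binary tree, using a singly linked "dead" list in place of the kernel's TAILQ; the node type is invented for illustration.]

#include <stddef.h>
#include <stdio.h>

struct node {
	struct node	*left, *right;	/* tree links */
	struct node	*dead_next;	/* wavefront / dead list link */
	int		 id;
};

/* Visit every node exactly once, in breadth-first order. */
void
teardown(struct node *root)
{
	struct node *tail = root, *n;

	if (root == NULL)
		return;
	root->dead_next = NULL;
	for (n = root; n != NULL; n = n->dead_next) {
		printf("killing entry %d\n", n->id);	/* uvm_unmap_kill_entry() */
		if (n->left != NULL) {
			tail->dead_next = n->left;
			tail = n->left;
			tail->dead_next = NULL;
		}
		if (n->right != NULL) {
			tail->dead_next = n->right;
			tail = n->right;
			tail->dead_next = NULL;
		}
	}
}

[Because each tree node is linked onto the list exactly once, the final list length can be checked against a full tree walk afterwards, which is what the VMMAP_DEBUG block above does with numt and numq.]
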
*/ + if (orig->fspace > 0) + free_before = free; + else + free_before = uvm_map_uaddr_e(map, orig); + if (free_before) + uvm_mapent_free_insert(map, free_before, orig); + uvm_mapent_addr_insert(map, next); + if (free) + uvm_mapent_free_insert(map, free, next); + + uvm_tree_sanity(map, __FILE__, __LINE__); +} - current = entry; - while ((current != &map->header) && (current->start < end)) { - if (UVM_ET_ISSUBMAP(current)) { - error = EINVAL; - goto out; - } - if ((new_prot & current->max_protection) != new_prot) { - error = EACCES; - goto out; - } - current = current->next; - } - /* go back and fix up protections (no need to clip this time). */ +#ifdef VMMAP_DEBUG - current = entry; +void +uvm_tree_assert(struct vm_map *map, int test, char *test_str, + char *file, int line) +{ + char* map_special; - while ((current != &map->header) && (current->start < end)) { - vm_prot_t old_prot; + if (test) + return; - UVM_MAP_CLIP_END(map, current, end); + if (map == kernel_map) + map_special = " (kernel_map)"; + else if (map == kmem_map) + map_special = " (kmem_map)"; + else + map_special = ""; + panic("uvm_tree_sanity %p%s (%s %d): %s", map, map_special, file, + line, test_str); +} - old_prot = current->protection; - if (set_max) - current->protection = - (current->max_protection = new_prot) & old_prot; - else - current->protection = new_prot; +/* + * Check that map is sane. + */ +void +uvm_tree_sanity(struct vm_map *map, char *file, int line) +{ + struct vm_map_entry *iter; + vaddr_t addr; + vaddr_t min, max, bound; /* Bounds checker. */ + struct uvm_addr_state *free; + addr = vm_map_min(map); + RB_FOREACH(iter, uvm_map_addr, &map->addr) { /* - * update physical map if necessary. worry about copy-on-write - * here -- CHECK THIS XXX + * Valid start, end. + * Catch overflow for end+fspace. + */ + UVM_ASSERT(map, iter->end >= iter->start, file, line); + UVM_ASSERT(map, VMMAP_FREE_END(iter) >= iter->end, file, line); + /* + * May not be empty. */ + UVM_ASSERT(map, iter->start < VMMAP_FREE_END(iter), + file, line); - if (current->protection != old_prot) { - /* update pmap! */ - if ((current->protection & MASK(entry)) == PROT_NONE && - VM_MAPENT_ISWIRED(entry)) - current->wired_count--; - pmap_protect(map->pmap, current->start, current->end, - current->protection & MASK(entry)); - } + /* + * Addresses for entry must lie within map boundaries. + */ + UVM_ASSERT(map, iter->start >= vm_map_min(map) && + VMMAP_FREE_END(iter) <= vm_map_max(map), file, line); /* - * If the map is configured to lock any future mappings, - * wire this entry now if the old protection was VM_PROT_NONE - * and the new protection is not VM_PROT_NONE. + * Tree may not have gaps. */ + UVM_ASSERT(map, iter->start == addr, file, line); + addr = VMMAP_FREE_END(iter); - if ((map->flags & VM_MAP_WIREFUTURE) != 0 && - VM_MAPENT_ISWIRED(entry) == 0 && - old_prot == VM_PROT_NONE && - new_prot != VM_PROT_NONE) { - if (uvm_map_pageable(map, entry->start, entry->end, - FALSE, UVM_LK_ENTER|UVM_LK_EXIT) != 0) { - /* - * If locking the entry fails, remember the - * error if it's the first one. Note we - * still continue setting the protection in - * the map, but will return the resource - * shortage condition regardless. - * - * XXX Ignore what the actual error is, - * XXX just call it a resource shortage - * XXX so that it doesn't get confused - * XXX what uvm_map_protect() itself would - * XXX normally return. - */ - error = ENOMEM; - } + /* + * Free space may not cross boundaries, unless the same + * free list is used on both sides of the border. 
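
[uvm_map_splitentry() cuts one entry, plus the free space trailing it, in two at 'split': the part below the split address stays in 'orig', the rest moves to 'next', and backing-object offsets advance by the number of bytes kept in 'orig'. A simplified sketch of that bookkeeping for the common case where the split lands inside the mapped range; the struct below is a toy, not struct vm_map_entry, and the amap/guard handling is left out.]

#include <assert.h>
#include <stdint.h>

#define PAGE_MASK	0xfffUL

struct toy_entry {
	uintptr_t	start, end;	/* mapped range [start, end) */
	uintptr_t	fspace;		/* free space after 'end' */
	uintptr_t	offset;		/* offset into backing object */
};

/* Split 'orig' at 'split'; 'next' receives the upper part. */
void
toy_splitentry(struct toy_entry *orig, struct toy_entry *next, uintptr_t split)
{
	uintptr_t adj = split - orig->start;

	assert((split & PAGE_MASK) == 0);
	assert(orig->start < split && split < orig->end);

	*next = *orig;			/* copy, then fix both halves up */
	orig->end = next->start = split;
	orig->fspace = 0;		/* free space stays with the upper half */
	next->offset += adj;		/* upper half starts 'adj' bytes further in */
}
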
+ */ + min = VMMAP_FREE_START(iter); + max = VMMAP_FREE_END(iter); + + while (min < max && + (bound = uvm_map_boundary(map, min, max)) != max) { + UVM_ASSERT(map, + uvm_map_uaddr(map, bound - 1) == + uvm_map_uaddr(map, bound), + file, line); + min = bound; } - current = current->next; + free = uvm_map_uaddr_e(map, iter); + if (free) { + UVM_ASSERT(map, (iter->etype & UVM_ET_FREEMAPPED) != 0, + file, line); + } else { + UVM_ASSERT(map, (iter->etype & UVM_ET_FREEMAPPED) == 0, + file, line); + } } - pmap_update(map->pmap); - - out: - vm_map_unlock(map); - return (error); + UVM_ASSERT(map, addr == vm_map_max(map), file, line); } -#undef max -#undef MASK - -/* - * uvm_map_inherit: set inheritance code for range of addrs in map. - * - * => map must be unlocked - * => note that the inherit code is used during a "fork". see fork - * code for details. - */ - -int -uvm_map_inherit(struct vm_map *map, vaddr_t start, vaddr_t end, - vm_inherit_t new_inheritance) +void +uvm_tree_size_chk(struct vm_map *map, char *file, int line) { - struct vm_map_entry *entry; - - switch (new_inheritance) { - case MAP_INHERIT_NONE: - case MAP_INHERIT_COPY: - case MAP_INHERIT_SHARE: - break; - default: - return (EINVAL); - } + struct vm_map_entry *iter; + vsize_t size; - vm_map_lock(map); - - VM_MAP_RANGE_CHECK(map, start, end); - - if (uvm_map_lookup_entry(map, start, &entry)) { - UVM_MAP_CLIP_START(map, entry, start); - } else { - entry = entry->next; + size = 0; + RB_FOREACH(iter, uvm_map_addr, &map->addr) { + if (!UVM_ET_ISHOLE(iter)) + size += iter->end - iter->start; } - while ((entry != &map->header) && (entry->start < end)) { - UVM_MAP_CLIP_END(map, entry, end); - entry->inheritance = new_inheritance; - entry = entry->next; - } + if (map->size != size) + printf("map size = 0x%lx, should be 0x%lx\n", map->size, size); + UVM_ASSERT(map, map->size == size, file, line); - vm_map_unlock(map); - return (0); + vmspace_validate(map); } -/* - * uvm_map_advice: set advice code for range of addrs in map. - * - * => map must be unlocked +/* + * This function validates the statistics on vmspace. */ - -int -uvm_map_advice(struct vm_map *map, vaddr_t start, vaddr_t end, int new_advice) +void +vmspace_validate(struct vm_map *map) { - struct vm_map_entry *entry; + struct vmspace *vm; + struct vm_map_entry *iter; + vaddr_t imin, imax; + vaddr_t stack_begin, stack_end; /* Position of stack. */ + vsize_t stack, heap; /* Measured sizes. */ - switch (new_advice) { - case MADV_NORMAL: - case MADV_RANDOM: - case MADV_SEQUENTIAL: - /* nothing special here */ - break; + if (!(map->flags & VM_MAP_ISVMSPACE)) + return; - default: - return (EINVAL); - } - vm_map_lock(map); - VM_MAP_RANGE_CHECK(map, start, end); - if (uvm_map_lookup_entry(map, start, &entry)) { - UVM_MAP_CLIP_START(map, entry, start); - } else { - entry = entry->next; - } + vm = (struct vmspace *)map; + stack_begin = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr); + stack_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr); - /* - * XXXJRT: disallow holes? - */ + stack = heap = 0; + RB_FOREACH(iter, uvm_map_addr, &map->addr) { + imin = imax = iter->start; - while ((entry != &map->header) && (entry->start < end)) { - UVM_MAP_CLIP_END(map, entry, end); + if (UVM_ET_ISHOLE(iter) || iter->object.uvm_obj != NULL) + continue; - entry->advice = new_advice; - entry = entry->next; + /* + * Update stack, heap. + * Keep in mind that (theoretically) the entries of + * userspace and stack may be joined. 
+ */ + while (imin != iter->end) { + /* + * Set imax to the first boundary crossed between + * imin and stack addresses. + */ + imax = iter->end; + if (imin < stack_begin && imax > stack_begin) + imax = stack_begin; + else if (imin < stack_end && imax > stack_end) + imax = stack_end; + + if (imin >= stack_begin && imin < stack_end) + stack += imax - imin; + else + heap += imax - imin; + imin = imax; + } } - vm_map_unlock(map); - return (0); + heap >>= PAGE_SHIFT; + if (heap != vm->vm_dused) { + printf("vmspace stack range: 0x%lx-0x%lx\n", + stack_begin, stack_end); + panic("vmspace_validate: vmspace.vm_dused invalid, " + "expected %ld pgs, got %ld pgs in map %p", + heap, vm->vm_dused, + map); + } } +#endif /* VMMAP_DEBUG */ + /* - * uvm_map_pageable: sets the pageability of a range in a map. - * - * => wires map entries. should not be used for transient page locking. - * for that, use uvm_fault_wire()/uvm_fault_unwire() (see uvm_vslock()). - * => regions sepcified as not pageable require lock-down (wired) memory - * and page tables. - * => map must never be read-locked - * => if islocked is TRUE, map is already write-locked - * => we always unlock the map, since we must downgrade to a read-lock - * to call uvm_fault_wire() - * => XXXCDC: check this and try and clean it up. + * uvm_map_init: init mapping system at boot time. note that we allocate + * and init the static pool of structs vm_map_entry for the kernel here. */ - -int -uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end, - boolean_t new_pageable, int lockflags) +void +uvm_map_init(void) { - struct vm_map_entry *entry, *start_entry, *failed_entry; - int rv; -#ifdef DIAGNOSTIC - u_int timestamp_save; + static struct vm_map_entry kernel_map_entry[MAX_KMAPENT]; + int lcv; + + /* + * now set up static pool of kernel map entries ... + */ + + simple_lock_init(&uvm.kentry_lock); + uvm.kentry_free = NULL; + for (lcv = 0 ; lcv < MAX_KMAPENT ; lcv++) { + RB_LEFT(&kernel_map_entry[lcv], daddrs.addr_entry) = + uvm.kentry_free; + uvm.kentry_free = &kernel_map_entry[lcv]; + } + + /* + * initialize the map-related pools. + */ + pool_init(&uvm_vmspace_pool, sizeof(struct vmspace), + 0, 0, 0, "vmsppl", &pool_allocator_nointr); + pool_init(&uvm_map_entry_pool, sizeof(struct vm_map_entry), + 0, 0, 0, "vmmpepl", &pool_allocator_nointr); + pool_init(&uvm_map_entry_kmem_pool, sizeof(struct vm_map_entry), + 0, 0, 0, "vmmpekpl", NULL); + pool_sethiwat(&uvm_map_entry_pool, 8192); + + uvm_addr_init(); +} + +#if defined(DDB) + +/* + * DDB hooks + */ + +/* + * uvm_map_printit: actually prints the map + */ +void +uvm_map_printit(struct vm_map *map, boolean_t full, + int (*pr)(const char *, ...)) +{ + struct vmspace *vm; + struct vm_map_entry *entry; + struct uvm_addr_state *free; + int in_free, i; + char buf[8]; + + (*pr)("MAP %p: [0x%lx->0x%lx]\n", map, map->min_offset,map->max_offset); + (*pr)("\tbrk() allocate range: 0x%lx-0x%lx\n", + map->b_start, map->b_end); + (*pr)("\tstack allocate range: 0x%lx-0x%lx\n", + map->s_start, map->s_end); + (*pr)("\tsz=%u, ref=%d, version=%u, flags=0x%x\n", + map->size, map->ref_count, map->timestamp, + map->flags); +#ifdef pmap_resident_count + (*pr)("\tpmap=%p(resident=%d)\n", map->pmap, + pmap_resident_count(map->pmap)); +#else + /* XXXCDC: this should be required ... */ + (*pr)("\tpmap=%p(resident=<>)\n", map->pmap); #endif - KASSERT(map->flags & VM_MAP_PAGEABLE); - if ((lockflags & UVM_LK_ENTER) == 0) + /* + * struct vmspace handling. 
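
[The validation loop above attributes each anonymous entry's pages to either "stack" or "heap" by repeatedly clipping the entry against the stack range boundaries, so an entry straddling a boundary is charged partly to each side. A standalone sketch of that clipping walk with plain integers and illustrative names.]

#include <stdio.h>

/*
 * Charge the range [start, end) to stack or heap, where the stack
 * occupies [stack_begin, stack_end).
 */
void
charge_range(unsigned long start, unsigned long end,
    unsigned long stack_begin, unsigned long stack_end,
    unsigned long *stack, unsigned long *heap)
{
	unsigned long imin = start, imax;

	while (imin != end) {
		/* Clip at the first stack boundary crossed. */
		imax = end;
		if (imin < stack_begin && imax > stack_begin)
			imax = stack_begin;
		else if (imin < stack_end && imax > stack_end)
			imax = stack_end;

		if (imin >= stack_begin && imin < stack_end)
			*stack += imax - imin;
		else
			*heap += imax - imin;
		imin = imax;
	}
}

int
main(void)
{
	unsigned long stack = 0, heap = 0;

	/* Entry 0x1000-0x5000 with the stack range starting at 0x3000. */
	charge_range(0x1000, 0x5000, 0x3000, 0x8000, &stack, &heap);
	printf("stack=%#lx heap=%#lx\n", stack, heap);	/* stack=0x2000 heap=0x2000 */
	return 0;
}
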
+ */ + if (map->flags & VM_MAP_ISVMSPACE) { + vm = (struct vmspace *)map; + + (*pr)("\tvm_refcnt=%d vm_shm=%p vm_rssize=%u vm_swrss=%u\n", + vm->vm_refcnt, vm->vm_shm, vm->vm_rssize, vm->vm_swrss); + (*pr)("\tvm_tsize=%u vm_dsize=%u\n", + vm->vm_tsize, vm->vm_dsize); + (*pr)("\tvm_taddr=%p vm_daddr=%p\n", + vm->vm_taddr, vm->vm_daddr); + (*pr)("\tvm_maxsaddr=%p vm_minsaddr=%p\n", + vm->vm_maxsaddr, vm->vm_minsaddr); + } + + if (!full) + goto print_uaddr; + RB_FOREACH(entry, uvm_map_addr, &map->addr) { + (*pr)(" - %p: 0x%lx->0x%lx: obj=%p/0x%llx, amap=%p/%d\n", + entry, entry->start, entry->end, entry->object.uvm_obj, + (long long)entry->offset, entry->aref.ar_amap, + entry->aref.ar_pageoff); + (*pr)("\tsubmap=%c, cow=%c, nc=%c, prot(max)=%d/%d, inh=%d, " + "wc=%d, adv=%d\n", + (entry->etype & UVM_ET_SUBMAP) ? 'T' : 'F', + (entry->etype & UVM_ET_COPYONWRITE) ? 'T' : 'F', + (entry->etype & UVM_ET_NEEDSCOPY) ? 'T' : 'F', + entry->protection, entry->max_protection, + entry->inheritance, entry->wired_count, entry->advice); + + free = uvm_map_uaddr_e(map, entry); + in_free = (free != NULL); + (*pr)("\thole=%c, free=%c, guard=0x%lx, " + "free=0x%lx-0x%lx\n", + (entry->etype & UVM_ET_HOLE) ? 'T' : 'F', + in_free ? 'T' : 'F', + entry->guard, + VMMAP_FREE_START(entry), VMMAP_FREE_END(entry)); + (*pr)("\tfreemapped=%c, uaddr=%p\n", + (entry->etype & UVM_ET_FREEMAPPED) ? 'T' : 'F', free); + if (free) { + (*pr)("\t\t(0x%lx-0x%lx %s)\n", + free->uaddr_minaddr, free->uaddr_maxaddr, + free->uaddr_functions->uaddr_name); + } + } + +print_uaddr: + uvm_addr_print(map->uaddr_exe, "exe", full, pr); + for (i = 0; i < nitems(map->uaddr_any); i++) { + snprintf(&buf[0], sizeof(buf), "any[%d]", i); + uvm_addr_print(map->uaddr_any[i], &buf[0], full, pr); + } + uvm_addr_print(map->uaddr_brk_stack, "brk/stack", full, pr); +} + +/* + * uvm_object_printit: actually prints the object + */ +void +uvm_object_printit(uobj, full, pr) + struct uvm_object *uobj; + boolean_t full; + int (*pr)(const char *, ...); +{ + struct vm_page *pg; + int cnt = 0; + + (*pr)("OBJECT %p: pgops=%p, npages=%d, ", + uobj, uobj->pgops, uobj->uo_npages); + if (UVM_OBJ_IS_KERN_OBJECT(uobj)) + (*pr)("refs=\n"); + else + (*pr)("refs=%d\n", uobj->uo_refs); + + if (!full) { + return; + } + (*pr)(" PAGES :\n "); + RB_FOREACH(pg, uvm_objtree, &uobj->memt) { + (*pr)("<%p,0x%llx> ", pg, (long long)pg->offset); + if ((cnt % 3) == 2) { + (*pr)("\n "); + } + cnt++; + } + if ((cnt % 3) != 2) { + (*pr)("\n"); + } +} + +/* + * uvm_page_printit: actually print the page + */ +static const char page_flagbits[] = + "\20\1BUSY\2WANTED\3TABLED\4CLEAN\5CLEANCHK\6RELEASED\7FAKE\10RDONLY" + "\11ZERO\15PAGER1\20FREE\21INACTIVE\22ACTIVE\24ENCRYPT\30PMAP0" + "\31PMAP1\32PMAP2\33PMAP3"; + +void +uvm_page_printit(pg, full, pr) + struct vm_page *pg; + boolean_t full; + int (*pr)(const char *, ...); +{ + struct vm_page *tpg; + struct uvm_object *uobj; + struct pglist *pgl; + + (*pr)("PAGE %p:\n", pg); + (*pr)(" flags=%b, vers=%d, wire_count=%d, pa=0x%llx\n", + pg->pg_flags, page_flagbits, pg->pg_version, pg->wire_count, + (long long)pg->phys_addr); + (*pr)(" uobject=%p, uanon=%p, offset=0x%llx loan_count=%d\n", + pg->uobject, pg->uanon, (long long)pg->offset, pg->loan_count); +#if defined(UVM_PAGE_TRKOWN) + if (pg->pg_flags & PG_BUSY) + (*pr)(" owning process = %d, tag=%s", + pg->owner, pg->owner_tag); + else + (*pr)(" page not busy, no owner"); +#else + (*pr)(" [page ownership tracking disabled]"); +#endif +#ifdef __HAVE_VM_PAGE_MD + (*pr)("\tvm_page_md %p\n", &pg->mdpage); 
+#else + (*pr)("\n"); +#endif + + if (!full) + return; + + /* cross-verify object/anon */ + if ((pg->pg_flags & PQ_FREE) == 0) { + if (pg->pg_flags & PQ_ANON) { + if (pg->uanon == NULL || pg->uanon->an_page != pg) + (*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n", + (pg->uanon) ? pg->uanon->an_page : NULL); + else + (*pr)(" anon backpointer is OK\n"); + } else { + uobj = pg->uobject; + if (uobj) { + (*pr)(" checking object list\n"); + RB_FOREACH(tpg, uvm_objtree, &uobj->memt) { + if (tpg == pg) { + break; + } + } + if (tpg) + (*pr)(" page found on object list\n"); + else + (*pr)(" >>> PAGE NOT FOUND " + "ON OBJECT LIST! <<<\n"); + } + } + } + + /* cross-verify page queue */ + if (pg->pg_flags & PQ_FREE) { + if (uvm_pmr_isfree(pg)) + (*pr)(" page found in uvm_pmemrange\n"); + else + (*pr)(" >>> page not found in uvm_pmemrange <<<\n"); + pgl = NULL; + } else if (pg->pg_flags & PQ_INACTIVE) { + pgl = (pg->pg_flags & PQ_SWAPBACKED) ? + &uvm.page_inactive_swp : &uvm.page_inactive_obj; + } else if (pg->pg_flags & PQ_ACTIVE) { + pgl = &uvm.page_active; + } else { + pgl = NULL; + } + + if (pgl) { + (*pr)(" checking pageq list\n"); + TAILQ_FOREACH(tpg, pgl, pageq) { + if (tpg == pg) { + break; + } + } + if (tpg) + (*pr)(" page found on pageq list\n"); + else + (*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n"); + } +} +#endif + +/* + * uvm_map_protect: change map protection + * + * => set_max means set max_protection. + * => map must be unlocked. + */ +int +uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end, + vm_prot_t new_prot, boolean_t set_max) +{ + struct vm_map_entry *first, *iter; + vm_prot_t old_prot; + vm_prot_t mask; + int error; + + if (start > end) + return EINVAL; + start = MAX(start, map->min_offset); + end = MIN(end, map->max_offset); + if (start >= end) + return 0; + + error = 0; + vm_map_lock(map); + + /* + * Set up first and last. + * - first will contain first entry at or after start. + */ + first = uvm_map_entrybyaddr(&map->addr, start); + KDASSERT(first != NULL); + if (first->end < start) + first = RB_NEXT(uvm_map_addr, &map->addr, first); + + /* + * First, check for protection violations. + */ + for (iter = first; iter != NULL && iter->start < end; + iter = RB_NEXT(uvm_map_addr, &map->addr, iter)) { + /* Treat memory holes as free space. */ + if (iter->start == iter->end || UVM_ET_ISHOLE(iter)) + continue; + + if (UVM_ET_ISSUBMAP(iter)) { + error = EINVAL; + goto out; + } + if ((new_prot & iter->max_protection) != new_prot) { + error = EACCES; + goto out; + } + } + + /* + * Fix protections. + */ + for (iter = first; iter != NULL && iter->start < end; + iter = RB_NEXT(uvm_map_addr, &map->addr, iter)) { + /* Treat memory holes as free space. */ + if (iter->start == iter->end || UVM_ET_ISHOLE(iter)) + continue; + + old_prot = iter->protection; + + /* + * Skip adapting protection iff old and new protection + * are equal. + */ + if (set_max) { + if (old_prot == (new_prot & old_prot) && + iter->max_protection == new_prot) + continue; + } else { + if (old_prot == new_prot) + continue; + } + + UVM_MAP_CLIP_START(map, iter, start); + UVM_MAP_CLIP_END(map, iter, end); + + if (set_max) { + iter->max_protection = new_prot; + iter->protection &= new_prot; + } else + iter->protection = new_prot; + + /* + * update physical map if necessary. worry about copy-on-write + * here -- CHECK THIS XXX + */ + if (iter->protection != old_prot) { + mask = UVM_ET_ISCOPYONWRITE(iter) ? 
+ ~VM_PROT_WRITE : VM_PROT_ALL; + + /* update pmap */ + if ((iter->protection & mask) == PROT_NONE && + VM_MAPENT_ISWIRED(iter)) { + /* + * TODO(ariane) this is stupid. wired_count + * is 0 if not wired, otherwise anything + * larger than 0 (incremented once each time + * wire is called). + * Mostly to be able to undo the damage on + * failure. Not the actually be a wired + * refcounter... + * Originally: iter->wired_count--; + * (don't we have to unwire this in the pmap + * as well?) + */ + iter->wired_count = 0; + } + pmap_protect(map->pmap, iter->start, iter->end, + iter->protection & mask); + } + + /* + * If the map is configured to lock any future mappings, + * wire this entry now if the old protection was VM_PROT_NONE + * and the new protection is not VM_PROT_NONE. + */ + if ((map->flags & VM_MAP_WIREFUTURE) != 0 && + VM_MAPENT_ISWIRED(iter) == 0 && + old_prot == VM_PROT_NONE && + new_prot != VM_PROT_NONE) { + if (uvm_map_pageable(map, iter->start, iter->end, + FALSE, UVM_LK_ENTER | UVM_LK_EXIT) != 0) { + /* + * If locking the entry fails, remember the + * error if it's the first one. Note we + * still continue setting the protection in + * the map, but it will return the resource + * storage condition regardless. + * + * XXX Ignore what the actual error is, + * XXX just call it a resource shortage + * XXX so that it doesn't get confused + * XXX what uvm_map_protect() itself would + * XXX normally return. + */ + error = ENOMEM; + } + } + } + pmap_update(map->pmap); + +out: + vm_map_unlock(map); + return error; +} + +/* + * uvmspace_alloc: allocate a vmspace structure. + * + * - structure includes vm_map and pmap + * - XXX: no locking on this structure + * - refcnt set to 1, rest must be init'd by caller + */ +struct vmspace * +uvmspace_alloc(vaddr_t min, vaddr_t max, boolean_t pageable, + boolean_t remove_holes) +{ + struct vmspace *vm; + + vm = pool_get(&uvm_vmspace_pool, PR_WAITOK | PR_ZERO); + uvmspace_init(vm, NULL, min, max, pageable, remove_holes); + return (vm); +} + +/* + * uvmspace_init: initialize a vmspace structure. + * + * - XXX: no locking on this structure + * - refcnt set to 1, rest must be init'd by caller + */ +void +uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t min, vaddr_t max, + boolean_t pageable, boolean_t remove_holes) +{ + if (pmap) + pmap_reference(pmap); + else + pmap = pmap_create(); + vm->vm_map.pmap = pmap; + + uvm_map_setup(&vm->vm_map, min, max, + (pageable ? VM_MAP_PAGEABLE : 0) | VM_MAP_ISVMSPACE); + + vm->vm_refcnt = 1; + + if (remove_holes) + pmap_remove_holes(&vm->vm_map); +} + +/* + * uvmspace_share: share a vmspace between two processes + * + * - XXX: no locking on vmspace + * - used for vfork and threads + */ + +void +uvmspace_share(p1, p2) + struct proc *p1, *p2; +{ + p2->p_vmspace = p1->p_vmspace; + p1->p_vmspace->vm_refcnt++; +} + +/* + * uvmspace_exec: the process wants to exec a new program + * + * - XXX: no locking on vmspace + */ + +void +uvmspace_exec(struct proc *p, vaddr_t start, vaddr_t end) +{ + struct vmspace *nvm, *ovm = p->p_vmspace; + struct vm_map *map = &ovm->vm_map; + struct uvm_map_deadq dead_entries; + + KASSERT((start & (vaddr_t)PAGE_MASK) == 0); + KASSERT((end & (vaddr_t)PAGE_MASK) == 0 || + (end & (vaddr_t)PAGE_MASK) == (vaddr_t)PAGE_MASK); + + pmap_unuse_final(p); /* before stack addresses go away */ + TAILQ_INIT(&dead_entries); + + /* + * see if more than one process is using this vmspace... 
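
[When pushing a protection change into the pmap, uvm_map_protect() above masks write permission out of copy-on-write entries, so the hardware mapping stays read-only and the first write still takes the fault that drives the copy. A tiny sketch of that masking rule, with the protection bits spelled out locally rather than taken from the uvm headers.]

#include <stdbool.h>

#define PROT_READ	0x1
#define PROT_WRITE	0x2
#define PROT_EXEC	0x4
#define PROT_ALL	(PROT_READ | PROT_WRITE | PROT_EXEC)

/*
 * Compute the protection to hand to the pmap: a copy-on-write entry may
 * never be mapped writable, whatever the map-level protection says.
 */
int
cow_pmap_prot(int entry_prot, bool copy_on_write)
{
	int mask = copy_on_write ? ~PROT_WRITE : PROT_ALL;

	return entry_prot & mask;
}
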
+ */ + + if (ovm->vm_refcnt == 1) { + /* + * if p is the only process using its vmspace then we can safely + * recycle that vmspace for the program that is being exec'd. + */ + +#ifdef SYSVSHM + /* + * SYSV SHM semantics require us to kill all segments on an exec + */ + if (ovm->vm_shm) + shmexit(ovm); +#endif + + /* + * POSIX 1003.1b -- "lock future mappings" is revoked + * when a process execs another program image. + */ vm_map_lock(map); + vm_map_modflags(map, 0, VM_MAP_WIREFUTURE); + + /* + * now unmap the old program + * + * Instead of attempting to keep the map valid, we simply + * nuke all entries and ask uvm_map_setup to reinitialize + * the map to the new boundaries. + * + * uvm_unmap_remove will actually nuke all entries for us + * (as in, not replace them with free-memory entries). + */ + uvm_unmap_remove(map, map->min_offset, map->max_offset, + &dead_entries, TRUE, FALSE); + + KDASSERT(RB_EMPTY(&map->addr)); + + /* + * Nuke statistics and boundaries. + */ + bzero(&ovm->vm_startcopy, + (caddr_t) (ovm + 1) - (caddr_t) &ovm->vm_startcopy); + + + if (end & (vaddr_t)PAGE_MASK) { + end += 1; + if (end == 0) /* overflow */ + end -= PAGE_SIZE; + } + + /* + * Setup new boundaries and populate map with entries. + */ + map->min_offset = start; + map->max_offset = end; + uvm_map_setup_entries(map); + vm_map_unlock(map); + + /* + * but keep MMU holes unavailable + */ + pmap_remove_holes(map); + + } else { + + /* + * p's vmspace is being shared, so we can't reuse it for p since + * it is still being used for others. allocate a new vmspace + * for p + */ + nvm = uvmspace_alloc(start, end, + (map->flags & VM_MAP_PAGEABLE) ? TRUE : FALSE, TRUE); + + /* + * install new vmspace and drop our ref to the old one. + */ + + pmap_deactivate(p); + p->p_vmspace = nvm; + pmap_activate(p); + + uvmspace_free(ovm); + } + + /* + * Release dead entries + */ + uvm_unmap_detach(&dead_entries, 0); +} + +/* + * uvmspace_free: free a vmspace data structure + * + * - XXX: no locking on vmspace + */ + +void +uvmspace_free(struct vmspace *vm) +{ + if (--vm->vm_refcnt == 0) { + /* + * lock the map, to wait out all other references to it. delete + * all of the mappings and pages they hold, then call the pmap + * module to reclaim anything left. + */ +#ifdef SYSVSHM + /* Get rid of any SYSV shared memory segments. */ + if (vm->vm_shm != NULL) + shmexit(vm); +#endif + + uvm_map_teardown(&vm->vm_map); + pool_put(&uvm_vmspace_pool, vm); + } +} + +/* + * Clone map entry into other map. + * + * Mapping will be placed at dstaddr, for the same length. + * Space must be available. + * Reference counters are incremented. + */ +struct vm_map_entry* +uvm_mapent_clone(struct vm_map *dstmap, vaddr_t dstaddr, vsize_t dstlen, + vsize_t off, struct vm_map_entry *old_entry, struct uvm_map_deadq *dead, + int mapent_flags, int amap_share_flags) +{ + struct vm_map_entry *new_entry, *first, *last; + + KDASSERT(!UVM_ET_ISSUBMAP(old_entry)); + + /* + * Create new entry (linked in on creation). + * Fill in first, last. 
+ */ + first = last = NULL; + if (!uvm_map_isavail(dstmap, NULL, &first, &last, dstaddr, dstlen)) { + panic("uvmspace_fork: no space in map for " + "entry in empty map"); + } + new_entry = uvm_map_mkentry(dstmap, first, last, + dstaddr, dstlen, mapent_flags, dead); + if (new_entry == NULL) + return NULL; + /* old_entry -> new_entry */ + new_entry->object = old_entry->object; + new_entry->offset = old_entry->offset; + new_entry->aref = old_entry->aref; + new_entry->etype |= old_entry->etype & ~UVM_ET_FREEMAPPED; + new_entry->protection = old_entry->protection; + new_entry->max_protection = old_entry->max_protection; + new_entry->inheritance = old_entry->inheritance; + new_entry->advice = old_entry->advice; + + /* + * gain reference to object backing the map (can't + * be a submap). + */ + if (new_entry->aref.ar_amap) { + new_entry->aref.ar_pageoff += off >> PAGE_SHIFT; + amap_ref(new_entry->aref.ar_amap, new_entry->aref.ar_pageoff, + (new_entry->end - new_entry->start) >> PAGE_SHIFT, + amap_share_flags); + } + + if (UVM_ET_ISOBJ(new_entry) && + new_entry->object.uvm_obj->pgops->pgo_reference) { + new_entry->offset += off; + new_entry->object.uvm_obj->pgops->pgo_reference + (new_entry->object.uvm_obj); + } + + return new_entry; +} + +/* + * share the mapping: this means we want the old and + * new entries to share amaps and backing objects. + */ +void +uvm_mapent_forkshared(struct vmspace *new_vm, struct vm_map *new_map, + struct vm_map *old_map, + struct vm_map_entry *old_entry, struct uvm_map_deadq *dead) +{ + struct vm_map_entry *new_entry; + + /* + * if the old_entry needs a new amap (due to prev fork) + * then we need to allocate it now so that we have + * something we own to share with the new_entry. [in + * other words, we need to clear needs_copy] + */ + + if (UVM_ET_ISNEEDSCOPY(old_entry)) { + /* get our own amap, clears needs_copy */ + amap_copy(old_map, old_entry, M_WAITOK, FALSE, + 0, 0); + /* XXXCDC: WAITOK??? */ + } + + new_entry = uvm_mapent_clone(new_map, old_entry->start, + old_entry->end - old_entry->start, 0, old_entry, + dead, 0, AMAP_SHARED); + + /* + * pmap_copy the mappings: this routine is optional + * but if it is there it will reduce the number of + * page faults in the new proc. + */ + pmap_copy(new_map->pmap, old_map->pmap, new_entry->start, + (new_entry->end - new_entry->start), new_entry->start); + + /* + * Update process statistics. + */ + if (!UVM_ET_ISHOLE(new_entry)) + new_map->size += new_entry->end - new_entry->start; + if (!UVM_ET_ISOBJ(new_entry) && !UVM_ET_ISHOLE(new_entry)) { + new_vm->vm_dused += + uvmspace_dused(new_map, new_entry->start, new_entry->end); + } +} + +/* + * copy-on-write the mapping (using mmap's + * MAP_PRIVATE semantics) + * + * allocate new_entry, adjust reference counts. + * (note that new references are read-only). + */ +void +uvm_mapent_forkcopy(struct vmspace *new_vm, struct vm_map *new_map, + struct vm_map *old_map, + struct vm_map_entry *old_entry, struct uvm_map_deadq *dead) +{ + struct vm_map_entry *new_entry; + boolean_t protect_child; - VM_MAP_RANGE_CHECK(map, start, end); + new_entry = uvm_mapent_clone(new_map, old_entry->start, + old_entry->end - old_entry->start, 0, old_entry, + dead, 0, 0); - /* - * only one pageability change may take place at one time, since - * uvm_fault_wire assumes it will be called only once for each - * wiring/unwiring. therefore, we have to make sure we're actually - * changing the pageability for the entire region. we do so before - * making any changes. 
- */ + new_entry->etype |= + (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY); - if (uvm_map_lookup_entry(map, start, &start_entry) == FALSE) { - if ((lockflags & UVM_LK_EXIT) == 0) - vm_map_unlock(map); + /* + * the new entry will need an amap. it will either + * need to be copied from the old entry or created + * from scratch (if the old entry does not have an + * amap). can we defer this process until later + * (by setting "needs_copy") or do we need to copy + * the amap now? + * + * we must copy the amap now if any of the following + * conditions hold: + * 1. the old entry has an amap and that amap is + * being shared. this means that the old (parent) + * process is sharing the amap with another + * process. if we do not clear needs_copy here + * we will end up in a situation where both the + * parent and child process are referring to the + * same amap with "needs_copy" set. if the + * parent write-faults, the fault routine will + * clear "needs_copy" in the parent by allocating + * a new amap. this is wrong because the + * parent is supposed to be sharing the old amap + * and the new amap will break that. + * + * 2. if the old entry has an amap and a non-zero + * wire count then we are going to have to call + * amap_cow_now to avoid page faults in the + * parent process. since amap_cow_now requires + * "needs_copy" to be clear we might as well + * clear it here as well. + * + */ - return (EFAULT); + if (old_entry->aref.ar_amap != NULL && + ((amap_flags(old_entry->aref.ar_amap) & + AMAP_SHARED) != 0 || + VM_MAPENT_ISWIRED(old_entry))) { + amap_copy(new_map, new_entry, M_WAITOK, FALSE, + 0, 0); + /* XXXCDC: M_WAITOK ... ok? */ } - entry = start_entry; - /* - * handle wiring and unwiring separately. + /* + * if the parent's entry is wired down, then the + * parent process does not want page faults on + * access to that memory. this means that we + * cannot do copy-on-write because we can't write + * protect the old entry. in this case we + * resolve all copy-on-write faults now, using + * amap_cow_now. note that we have already + * allocated any needed amap (above). */ - if (new_pageable) { /* unwire */ - UVM_MAP_CLIP_START(map, entry, start); + if (VM_MAPENT_ISWIRED(old_entry)) { - /* - * unwiring. first ensure that the range to be unwired is - * really wired down and that there are no holes. + /* + * resolve all copy-on-write faults now + * (note that there is nothing to do if + * the old mapping does not have an amap). + * XXX: is it worthwhile to bother with + * pmap_copy in this case? */ + if (old_entry->aref.ar_amap) + amap_cow_now(new_map, new_entry); + + } else { + if (old_entry->aref.ar_amap) { - while ((entry != &map->header) && (entry->start < end)) { - if (entry->wired_count == 0 || - (entry->end < end && - (entry->next == &map->header || - entry->next->start > entry->end))) { - if ((lockflags & UVM_LK_EXIT) == 0) - vm_map_unlock(map); - return (EINVAL); + /* + * setup mappings to trigger copy-on-write faults + * we must write-protect the parent if it has + * an amap and it is not already "needs_copy"... + * if it is already "needs_copy" then the parent + * has already been write-protected by a previous + * fork operation. + * + * if we do not write-protect the parent, then + * we must be sure to write-protect the child + * after the pmap_copy() operation. + * + * XXX: pmap_copy should have some way of telling + * us that it didn't do anything so we can avoid + * calling pmap_protect needlessly. 
+ */ + if (!UVM_ET_ISNEEDSCOPY(old_entry)) { + if (old_entry->max_protection & + VM_PROT_WRITE) { + pmap_protect(old_map->pmap, + old_entry->start, + old_entry->end, + old_entry->protection & + ~VM_PROT_WRITE); + pmap_update(old_map->pmap); + } + old_entry->etype |= UVM_ET_NEEDSCOPY; } - entry = entry->next; + + /* + * parent must now be write-protected + */ + protect_child = FALSE; + } else { + + /* + * we only need to protect the child if the + * parent has write access. + */ + if (old_entry->max_protection & VM_PROT_WRITE) + protect_child = TRUE; + else + protect_child = FALSE; + } - /* - * POSIX 1003.1b - a single munlock call unlocks a region, - * regardless of the number of mlock calls made on that - * region. + /* + * copy the mappings + * XXX: need a way to tell if this does anything */ - entry = start_entry; - while ((entry != &map->header) && (entry->start < end)) { - UVM_MAP_CLIP_END(map, entry, end); - if (VM_MAPENT_ISWIRED(entry)) - uvm_map_entry_unwire(map, entry); - entry = entry->next; + pmap_copy(new_map->pmap, old_map->pmap, + new_entry->start, + (old_entry->end - old_entry->start), + old_entry->start); + + /* + * protect the child's mappings if necessary + */ + if (protect_child) { + pmap_protect(new_map->pmap, new_entry->start, + new_entry->end, + new_entry->protection & + ~VM_PROT_WRITE); } - if ((lockflags & UVM_LK_EXIT) == 0) - vm_map_unlock(map); - return (0); } /* - * wire case: in two passes [XXXCDC: ugly block of code here] - * - * 1: holding the write lock, we create any anonymous maps that need - * to be created. then we clip each map entry to the region to - * be wired and increment its wiring count. - * - * 2: we downgrade to a read lock, and call uvm_fault_wire to fault - * in the pages for any newly wired area (wired_count == 1). - * - * downgrading to a read lock for uvm_fault_wire avoids a possible - * deadlock with another thread that may have faulted on one of - * the pages to be wired (it would mark the page busy, blocking - * us, then in turn block on the map lock that we hold). because - * of problems in the recursive lock package, we cannot upgrade - * to a write lock in vm_map_lookup. thus, any actions that - * require the write lock must be done beforehand. because we - * keep the read lock on the map, the copy-on-write status of the - * entries we modify here cannot change. + * Update process statistics. */ + if (!UVM_ET_ISHOLE(new_entry)) + new_map->size += new_entry->end - new_entry->start; + if (!UVM_ET_ISOBJ(new_entry) && !UVM_ET_ISHOLE(new_entry)) { + new_vm->vm_dused += + uvmspace_dused(new_map, new_entry->start, new_entry->end); + } +} - while ((entry != &map->header) && (entry->start < end)) { - if (VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */ +/* + * uvmspace_fork: fork a process' main map + * + * => create a new vmspace for child process from parent. + * => parent's map must not be locked. + */ +struct vmspace * +uvmspace_fork(struct vmspace *vm1) +{ + struct vmspace *vm2; + struct vm_map *old_map = &vm1->vm_map; + struct vm_map *new_map; + struct vm_map_entry *old_entry; + struct uvm_map_deadq dead; - /* - * perform actions of vm_map_lookup that need the - * write lock on the map: create an anonymous map - * for a copy-on-write region, or an anonymous map - * for a zero-fill region. (XXXCDC: submap case - * ok?) 
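
[The copy-on-write fork path above only copies the parent's amap immediately when deferring would be unsafe: either the amap is already shared with another process, or the parent entry is wired and so must never fault. Otherwise the copy is deferred by marking the entry needs_copy and write-protecting the parent. A condensed sketch of that decision; the boolean parameters stand in for the amap flag and wired-count tests and are not kernel names.]

#include <stdbool.h>

/*
 * Decide whether a forked COW entry needs its anon map copied right away,
 * or whether the copy can wait for the first write fault.
 */
bool
must_copy_amap_now(bool has_amap, bool amap_shared, bool entry_wired)
{
	if (!has_amap)
		return false;	/* nothing to copy yet */
	if (amap_shared)
		return true;	/* deferring would alias a shared amap */
	if (entry_wired)
		return true;	/* wired pages must never fault for COW */
	return false;		/* defer: mark needs_copy, write-protect parent */
}
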
- */ + vm_map_lock(old_map); - if (!UVM_ET_ISSUBMAP(entry)) { /* not submap */ - if (UVM_ET_ISNEEDSCOPY(entry) && - ((entry->protection & VM_PROT_WRITE) || - (entry->object.uvm_obj == NULL))) { - amap_copy(map, entry, M_WAITOK, TRUE, - start, end); - /* XXXCDC: wait OK? */ - } - } + vm2 = uvmspace_alloc(old_map->min_offset, old_map->max_offset, + (old_map->flags & VM_MAP_PAGEABLE) ? TRUE : FALSE, FALSE); + memcpy(&vm2->vm_startcopy, &vm1->vm_startcopy, + (caddr_t) (vm1 + 1) - (caddr_t) &vm1->vm_startcopy); + vm2->vm_dused = 0; /* Statistic managed by us. */ + new_map = &vm2->vm_map; + vm_map_lock(new_map); + + /* + * go entry-by-entry + */ + + TAILQ_INIT(&dead); + RB_FOREACH(old_entry, uvm_map_addr, &old_map->addr) { + if (old_entry->start == old_entry->end) + continue; + + /* + * first, some sanity checks on the old entry + */ + if (UVM_ET_ISSUBMAP(old_entry)) { + panic("fork: encountered a submap during fork " + "(illegal)"); + } + + if (!UVM_ET_ISCOPYONWRITE(old_entry) && + UVM_ET_ISNEEDSCOPY(old_entry)) { + panic("fork: non-copy_on_write map entry marked " + "needs_copy (illegal)"); + } + + /* + * Apply inheritance. + */ + if (old_entry->inheritance == MAP_INHERIT_SHARE) { + uvm_mapent_forkshared(vm2, new_map, + old_map, old_entry, &dead); } + if (old_entry->inheritance == MAP_INHERIT_COPY) { + uvm_mapent_forkcopy(vm2, new_map, + old_map, old_entry, &dead); + } + } + + vm_map_unlock(old_map); + vm_map_unlock(new_map); + + /* + * This can actually happen, if multiple entries described a + * space in which an entry was inherited. + */ + uvm_unmap_detach(&dead, 0); + +#ifdef SYSVSHM + if (vm1->vm_shm) + shmfork(vm1, vm2); +#endif + +#ifdef PMAP_FORK + pmap_fork(vm1->vm_map.pmap, vm2->vm_map.pmap); +#endif + + return vm2; +} + +/* + * uvm_map_hint: return the beginning of the best area suitable for + * creating a new mapping with "prot" protection. + */ +vaddr_t +uvm_map_hint(struct vmspace *vm, vm_prot_t prot) +{ + vaddr_t addr; + +#ifdef __i386__ + /* + * If executable skip first two pages, otherwise start + * after data + heap region. + */ + if ((prot & VM_PROT_EXECUTE) != 0 && + (vaddr_t)vm->vm_daddr >= I386_MAX_EXE_ADDR) { + addr = (PAGE_SIZE*2) + + (arc4random() & (I386_MAX_EXE_ADDR / 2 - 1)); + return (round_page(addr)); + } +#endif + /* start malloc/mmap after the brk */ + addr = (vaddr_t)vm->vm_daddr + BRKSIZ; +#if !defined(__vax__) + addr += arc4random() & (MIN((256 * 1024 * 1024), BRKSIZ) - 1); +#endif + return (round_page(addr)); +} + +/* + * uvm_map_submap: punch down part of a map into a submap + * + * => only the kernel_map is allowed to be submapped + * => the purpose of submapping is to break up the locking granularity + * of a larger map + * => the range specified must have been mapped previously with a uvm_map() + * call [with uobj==NULL] to create a blank map entry in the main map. + * [And it had better still be blank!] + * => maps which contain submaps should never be copied or forked. + * => to remove a submap, use uvm_unmap() on the main map + * and then uvm_map_deallocate() the submap. + * => main map must be unlocked. + * => submap must have been init'd and have a zero reference count. 
+ * [need not be locked as we don't actually reference it] + */ +int +uvm_map_submap(struct vm_map *map, vaddr_t start, vaddr_t end, + struct vm_map *submap) +{ + struct vm_map_entry *entry; + int result; + + if (start > map->max_offset || end > map->max_offset || + start < map->min_offset || end < map->min_offset) + return EINVAL; + + vm_map_lock(map); + + if (uvm_map_lookup_entry(map, start, &entry)) { UVM_MAP_CLIP_START(map, entry, start); UVM_MAP_CLIP_END(map, entry, end); - entry->wired_count++; + } else + entry = NULL; + + if (entry != NULL && + entry->start == start && entry->end == end && + entry->object.uvm_obj == NULL && entry->aref.ar_amap == NULL && + !UVM_ET_ISCOPYONWRITE(entry) && !UVM_ET_ISNEEDSCOPY(entry)) { + entry->etype |= UVM_ET_SUBMAP; + entry->object.sub_map = submap; + entry->offset = 0; + uvm_map_reference(submap); + result = 0; + } else + result = EINVAL; + + vm_map_unlock(map); + return(result); +} + +/* + * uvm_map_checkprot: check protection in map + * + * => must allow specific protection in a fully allocated region. + * => map mut be read or write locked by caller. + */ +boolean_t +uvm_map_checkprot(struct vm_map *map, vaddr_t start, vaddr_t end, + vm_prot_t protection) +{ + struct vm_map_entry *entry; + + if (start < map->min_offset || end > map->max_offset || start > end) + return FALSE; + if (start == end) + return TRUE; + /* + * Iterate entries. + */ + for (entry = uvm_map_entrybyaddr(&map->addr, start); + entry != NULL && entry->start < end; + entry = RB_NEXT(uvm_map_addr, &map->addr, entry)) { /* - * Check for holes + * Fail if a hole is found. */ + if (UVM_ET_ISHOLE(entry) || + (entry->end < end && entry->end != VMMAP_FREE_END(entry))) + return FALSE; - if (entry->protection == VM_PROT_NONE || - (entry->end < end && - (entry->next == &map->header || - entry->next->start > entry->end))) { + /* + * Check protection. + */ + if ((entry->protection & protection) != protection) + return FALSE; + } + return TRUE; +} - /* - * found one. amap creation actions do not need to - * be undone, but the wired counts need to be restored. - */ +/* + * uvm_map_create: create map + */ +vm_map_t +uvm_map_create(pmap_t pmap, vaddr_t min, vaddr_t max, int flags) +{ + vm_map_t result; - while (entry != &map->header && entry->end > start) { - entry->wired_count--; - entry = entry->prev; - } - if ((lockflags & UVM_LK_EXIT) == 0) - vm_map_unlock(map); - return (EINVAL); - } - entry = entry->next; + result = malloc(sizeof(struct vm_map), M_VMMAP, M_WAITOK); + result->pmap = pmap; + uvm_map_setup(result, min, max, flags); + return(result); +} + +/* + * uvm_map_deallocate: drop reference to a map + * + * => caller must not lock map + * => we will zap map if ref count goes to zero + */ +void +uvm_map_deallocate(vm_map_t map) +{ + int c; + struct uvm_map_deadq dead; + + simple_lock(&map->ref_lock); + c = --map->ref_count; + simple_unlock(&map->ref_lock); + if (c > 0) { + return; } /* - * Pass 2. + * all references gone. unmap and free. + * + * No lock required: we are only one to access this map. 
*/ -#ifdef DIAGNOSTIC - timestamp_save = map->timestamp; -#endif - vm_map_busy(map); - vm_map_downgrade(map); + TAILQ_INIT(&dead); + uvm_tree_sanity(map, __FILE__, __LINE__); + uvm_unmap_remove(map, map->min_offset, map->max_offset, &dead, + TRUE, FALSE); + pmap_destroy(map->pmap); + KASSERT(RB_EMPTY(&map->addr)); + free(map, M_VMMAP); - rv = 0; - entry = start_entry; - while (entry != &map->header && entry->start < end) { - if (entry->wired_count == 1) { - rv = uvm_fault_wire(map, entry->start, entry->end, - entry->protection); - if (rv) { - /* - * wiring failed. break out of the loop. - * we'll clean up the map below, once we - * have a write lock again. - */ - break; - } - } - entry = entry->next; + uvm_unmap_detach(&dead, 0); +} + +/* + * uvm_map_inherit: set inheritance code for range of addrs in map. + * + * => map must be unlocked + * => note that the inherit code is used during a "fork". see fork + * code for details. + */ +int +uvm_map_inherit(struct vm_map *map, vaddr_t start, vaddr_t end, + vm_inherit_t new_inheritance) +{ + struct vm_map_entry *entry; + + switch (new_inheritance) { + case MAP_INHERIT_NONE: + case MAP_INHERIT_COPY: + case MAP_INHERIT_SHARE: + break; + default: + return (EINVAL); } - if (rv) { /* failed? */ - - /* - * Get back to an exclusive (write) lock. - */ + if (start > end) + return EINVAL; + start = MAX(start, map->min_offset); + end = MIN(end, map->max_offset); + if (start >= end) + return 0; - vm_map_upgrade(map); - vm_map_unbusy(map); + vm_map_lock(map); -#ifdef DIAGNOSTIC - if (timestamp_save != map->timestamp) - panic("uvm_map_pageable: stale map"); -#endif + entry = uvm_map_entrybyaddr(&map->addr, start); + if (entry->end > start) + UVM_MAP_CLIP_START(map, entry, start); + else + entry = RB_NEXT(uvm_map_addr, &map->addr, entry); - /* - * first drop the wiring count on all the entries - * which haven't actually been wired yet. - */ + while (entry != NULL && entry->start < end) { + UVM_MAP_CLIP_END(map, entry, end); + entry->inheritance = new_inheritance; + entry = RB_NEXT(uvm_map_addr, &map->addr, entry); + } - failed_entry = entry; - while (entry != &map->header && entry->start < end) { - entry->wired_count--; - entry = entry->next; - } + vm_map_unlock(map); + return (0); +} - /* - * now, unwire all the entries that were successfully - * wired above. - */ +/* + * uvm_map_advice: set advice code for range of addrs in map. + * + * => map must be unlocked + */ +int +uvm_map_advice(struct vm_map *map, vaddr_t start, vaddr_t end, int new_advice) +{ + struct vm_map_entry *entry; - entry = start_entry; - while (entry != failed_entry) { - entry->wired_count--; - if (VM_MAPENT_ISWIRED(entry) == 0) - uvm_map_entry_unwire(map, entry); - entry = entry->next; - } - if ((lockflags & UVM_LK_EXIT) == 0) - vm_map_unlock(map); - return(rv); + switch (new_advice) { + case MADV_NORMAL: + case MADV_RANDOM: + case MADV_SEQUENTIAL: + break; + default: + return (EINVAL); } - /* We are holding a read lock here. */ - if ((lockflags & UVM_LK_EXIT) == 0) { - vm_map_unbusy(map); - vm_map_unlock_read(map); - } else { + if (start > end) + return EINVAL; + start = MAX(start, map->min_offset); + end = MIN(end, map->max_offset); + if (start >= end) + return 0; - /* - * Get back to an exclusive (write) lock. 
- */ + vm_map_lock(map); - vm_map_upgrade(map); - vm_map_unbusy(map); + entry = uvm_map_entrybyaddr(&map->addr, start); + if (entry != NULL && entry->end > start) + UVM_MAP_CLIP_START(map, entry, start); + else if (entry!= NULL) + entry = RB_NEXT(uvm_map_addr, &map->addr, entry); + + /* + * XXXJRT: disallow holes? + */ + + while (entry != NULL && entry->start < end) { + UVM_MAP_CLIP_END(map, entry, end); + entry->advice = new_advice; + entry = RB_NEXT(uvm_map_addr, &map->addr, entry); } + vm_map_unlock(map); return (0); } /* - * uvm_map_pageable_all: special case of uvm_map_pageable - affects - * all mapped regions. + * uvm_map_extract: extract a mapping from a map and put it somewhere + * in the kernel_map, setting protection to max_prot. * - * => map must not be locked. - * => if no flags are specified, all regions are unwired. - * => XXXJRT: has some of the same problems as uvm_map_pageable() above. + * => map should be unlocked (we will write lock it and kernel_map) + * => returns 0 on success, error code otherwise + * => start must be page aligned + * => len must be page sized + * => flags: + * UVM_EXTRACT_FIXPROT: set prot to maxprot as we go + * Mappings are QREF's. */ - int -uvm_map_pageable_all(struct vm_map *map, int flags, vsize_t limit) +uvm_map_extract(struct vm_map *srcmap, vaddr_t start, vsize_t len, + vaddr_t *dstaddrp, int flags) { - struct vm_map_entry *entry, *failed_entry; - vsize_t size; + struct uvm_map_deadq dead; + struct vm_map_entry *first, *entry, *newentry, *tmp1, *tmp2; + vaddr_t dstaddr; + vaddr_t end; + vaddr_t cp_start; + vsize_t cp_len, cp_off; int error; -#ifdef DIAGNOSTIC - u_int timestamp_save; -#endif - - KASSERT(map->flags & VM_MAP_PAGEABLE); - vm_map_lock(map); + TAILQ_INIT(&dead); + end = start + len; /* - * handle wiring and unwiring separately. + * Sanity check on the parameters. + * Also, since the mapping may not contain gaps, error out if the + * mapped area is not in source map. */ - if (flags == 0) { /* unwire */ - /* - * POSIX 1003.1b -- munlockall unlocks all regions, - * regardless of how many times mlockall has been called. - */ - for (entry = map->header.next; entry != &map->header; - entry = entry->next) { - if (VM_MAPENT_ISWIRED(entry)) - uvm_map_entry_unwire(map, entry); - } - vm_map_modflags(map, 0, VM_MAP_WIREFUTURE); - vm_map_unlock(map); - return (0); + if ((start & (vaddr_t)PAGE_MASK) != 0 || + (end & (vaddr_t)PAGE_MASK) != 0 || end < start) + return EINVAL; + if (start < srcmap->min_offset || end > srcmap->max_offset) + return EINVAL; - /* - * end of unwire case! - */ - } + /* + * Initialize dead entries. + * Handle len == 0 case. + */ - if (flags & MCL_FUTURE) { - /* - * must wire all future mappings; remember this. - */ - vm_map_modflags(map, VM_MAP_WIREFUTURE, 0); - } + if (len == 0) + return 0; - if ((flags & MCL_CURRENT) == 0) { - /* - * no more work to do! - */ - vm_map_unlock(map); - return (0); - } + /* + * Acquire lock on srcmap. + */ + vm_map_lock(srcmap); /* - * wire case: in three passes [XXXCDC: ugly block of code here] - * - * 1: holding the write lock, count all pages mapped by non-wired - * entries. if this would cause us to go over our limit, we fail. - * - * 2: still holding the write lock, we create any anonymous maps that - * need to be created. then we increment its wiring count. - * - * 3: we downgrade to a read lock, and call uvm_fault_wire to fault - * in the pages for any newly wired area (wired_count == 1). 
- * - * downgrading to a read lock for uvm_fault_wire avoids a possible - * deadlock with another thread that may have faulted on one of - * the pages to be wired (it would mark the page busy, blocking - * us, then in turn block on the map lock that we hold). because - * of problems in the recursive lock package, we cannot upgrade - * to a write lock in vm_map_lookup. thus, any actions that - * require the write lock must be done beforehand. because we - * keep the read lock on the map, the copy-on-write status of the - * entries we modify here cannot change. - */ - - for (size = 0, entry = map->header.next; entry != &map->header; - entry = entry->next) { - if (entry->protection != VM_PROT_NONE && - VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */ - size += entry->end - entry->start; - } - } + * Lock srcmap, lookup first and last entry in . + */ + first = uvm_map_entrybyaddr(&srcmap->addr, start); - if (atop(size) + uvmexp.wired > uvmexp.wiredmax) { - vm_map_unlock(map); - return (ENOMEM); /* XXX overloaded */ + /* + * Check that the range is contiguous. + */ + for (entry = first; entry != NULL && entry->end < end; + entry = RB_NEXT(uvm_map_addr, &map->addr, entry)) { + if (VMMAP_FREE_END(entry) != entry->end || + UVM_ET_ISHOLE(entry)) { + error = EINVAL; + goto fail; + } } - - /* XXX non-pmap_wired_count case must be handled by caller */ -#ifdef pmap_wired_count - if (limit != 0 && - (size + ptoa(pmap_wired_count(vm_map_pmap(map))) > limit)) { - vm_map_unlock(map); - return (ENOMEM); /* XXX overloaded */ + if (entry == NULL || UVM_ET_ISHOLE(entry)) { + error = EINVAL; + goto fail; } -#endif /* - * Pass 2. + * Handle need-copy flag. + * This may invalidate last, hence the re-initialization during the + * loop. + * + * Also, perform clipping of last if not UVM_EXTRACT_QREF. */ - - for (entry = map->header.next; entry != &map->header; - entry = entry->next) { - if (entry->protection == VM_PROT_NONE) - continue; - if (VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */ + for (entry = first; entry != NULL && entry->start < end; + entry = RB_NEXT(uvm_map_addr, &map->addr, entry)) { + if (UVM_ET_ISNEEDSCOPY(entry)) + amap_copy(srcmap, entry, M_NOWAIT, TRUE, start, end); + if (UVM_ET_ISNEEDSCOPY(entry)) { /* - * perform actions of vm_map_lookup that need the - * write lock on the map: create an anonymous map - * for a copy-on-write region, or an anonymous map - * for a zero-fill region. (XXXCDC: submap case - * ok?) + * amap_copy failure */ - if (!UVM_ET_ISSUBMAP(entry)) { /* not submap */ - if (UVM_ET_ISNEEDSCOPY(entry) && - ((entry->protection & VM_PROT_WRITE) || - (entry->object.uvm_obj == NULL))) { - amap_copy(map, entry, M_WAITOK, TRUE, - entry->start, entry->end); - /* XXXCDC: wait OK? */ - } - } + error = ENOMEM; + goto fail; } - entry->wired_count++; } /* - * Pass 3. + * Lock destination map (kernel_map). */ + vm_map_lock(kernel_map); -#ifdef DIAGNOSTIC - timestamp_save = map->timestamp; -#endif - vm_map_busy(map); - vm_map_downgrade(map); - - for (error = 0, entry = map->header.next; - entry != &map->header && error == 0; - entry = entry->next) { - if (entry->wired_count == 1) { - error = uvm_fault_wire(map, entry->start, entry->end, - entry->protection); - } + if (uvm_map_findspace(kernel_map, &tmp1, &tmp2, &dstaddr, len, + PAGE_SIZE, 0, VM_PROT_NONE, 0) != 0) { + error = ENOMEM; + goto fail2; } + *dstaddrp = dstaddr; - if (error) { /* failed? */ - /* - * Get back an exclusive (write) lock. 
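+	/*
+	 * Illustrative use of uvm_map_extract(), not part of this
+	 * change (caller context assumed): mirror a page-aligned user
+	 * range into kernel_map, then drop the kernel window again
+	 * with uvm_unmap() when done.
+	 */
+	vaddr_t kva;
+
+	error = uvm_map_extract(&p->p_vmspace->vm_map, uva, len, &kva,
+	    UVM_EXTRACT_FIXPROT);
+	if (error == 0) {
+		/* ... access the user pages through kva ... */
+		uvm_unmap(kernel_map, kva, kva + len);
+	}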
- */ - vm_map_upgrade(map); - vm_map_unbusy(map); + /* + * We now have srcmap and kernel_map locked. + * dstaddr contains the destination offset in dstmap. + */ -#ifdef DIAGNOSTIC - if (timestamp_save != map->timestamp) - panic("uvm_map_pageable_all: stale map"); -#endif + /* + * step 1: start looping through map entries, performing extraction. + */ + for (entry = first; entry != NULL && entry->start < end; + entry = RB_NEXT(uvm_map_addr, &map->addr, entry)) { + KDASSERT(!UVM_ET_ISNEEDSCOPY(entry)); + if (UVM_ET_ISHOLE(entry)) + continue; /* - * first drop the wiring count on all the entries - * which haven't actually been wired yet. - * - * Skip VM_PROT_NONE entries like we did above. + * Calculate uvm_mapent_clone parameters. */ - failed_entry = entry; - for (/* nothing */; entry != &map->header; - entry = entry->next) { - if (entry->protection == VM_PROT_NONE) - continue; - entry->wired_count--; + cp_start = entry->start; + if (cp_start < start) { + cp_off = start - cp_start; + cp_start = start; + } else + cp_off = 0; + cp_len = MIN(entry->end, end) - cp_start; + + newentry = uvm_mapent_clone(kernel_map, + cp_start - start + dstaddr, cp_len, cp_off, + entry, &dead, flags, AMAP_SHARED | AMAP_REFALL); + if (newentry == NULL) { + error = ENOMEM; + goto fail2_unmap; } + kernel_map->size += cp_len; + if (flags & UVM_EXTRACT_FIXPROT) + newentry->protection = newentry->max_protection; /* - * now, unwire all the entries that were successfully - * wired above. - * - * Skip VM_PROT_NONE entries like we did above. + * Step 2: perform pmap copy. + * (Doing this in the loop saves one RB traversal.) */ - for (entry = map->header.next; entry != failed_entry; - entry = entry->next) { - if (entry->protection == VM_PROT_NONE) - continue; - entry->wired_count--; - if (VM_MAPENT_ISWIRED(entry)) - uvm_map_entry_unwire(map, entry); - } - vm_map_unlock(map); - return (error); + pmap_copy(kernel_map->pmap, srcmap->pmap, + cp_start - start + dstaddr, cp_len, cp_start); } + pmap_update(kernel_map->pmap); - /* We are holding a read lock here. */ - vm_map_unbusy(map); - vm_map_unlock_read(map); + error = 0; - return (0); + /* + * Unmap copied entries on failure. + */ +fail2_unmap: + if (error) { + uvm_unmap_remove(kernel_map, dstaddr, dstaddr + len, &dead, + FALSE, TRUE); + } + + /* + * Release maps, release dead entries. + */ +fail2: + vm_map_unlock(kernel_map); + +fail: + vm_map_unlock(srcmap); + + uvm_unmap_detach(&dead, 0); + + return error; } /* @@ -2913,71 +4066,72 @@ int amap_clean_works = 1; /* XXX for now, just in case... */ int uvm_map_clean(struct vm_map *map, vaddr_t start, vaddr_t end, int flags) { - struct vm_map_entry *current, *entry; - struct uvm_object *uobj; + struct vm_map_entry *first, *entry; struct vm_amap *amap; struct vm_anon *anon; struct vm_page *pg; - vaddr_t offset; - vsize_t size; - int rv, error, refs; + struct uvm_object *uobj; + vaddr_t cp_start, cp_end; + int refs; + int error; + boolean_t rv; KASSERT((flags & (PGO_FREE|PGO_DEACTIVATE)) != - (PGO_FREE|PGO_DEACTIVATE)); + (PGO_FREE|PGO_DEACTIVATE)); + + if (start > end || start < map->min_offset || end > map->max_offset) + return EINVAL; vm_map_lock_read(map); - VM_MAP_RANGE_CHECK(map, start, end); - if (uvm_map_lookup_entry(map, start, &entry) == FALSE) { - vm_map_unlock_read(map); - return (EFAULT); - } + first = uvm_map_entrybyaddr(&map->addr, start); /* * Make a first pass to check for holes. 
*/ - - for (current = entry; current->start < end; current = current->next) { - if (UVM_ET_ISSUBMAP(current)) { + for (entry = first; entry->start < end; + entry = RB_NEXT(uvm_map_addr, &map->addr, entry)) { + if (UVM_ET_ISSUBMAP(entry)) { vm_map_unlock_read(map); - return (EINVAL); + return EINVAL; } - if (end > current->end && (current->next == &map->header || - current->end != current->next->start)) { + if (UVM_ET_ISSUBMAP(entry) || + UVM_ET_ISHOLE(entry) || + (entry->end < end && + VMMAP_FREE_END(entry) != entry->end)) { vm_map_unlock_read(map); - return (EFAULT); + return EFAULT; } } error = 0; - - for (current = entry; current->start < end; current = current->next) { - amap = current->aref.ar_amap; /* top layer */ - uobj = current->object.uvm_obj; /* bottom layer */ - KASSERT(start >= current->start); + for (entry = first; entry != NULL && entry->start < end; + entry = RB_NEXT(uvm_map_addr, &map->addr, entry)) { + amap = entry->aref.ar_amap; /* top layer */ + if (UVM_ET_ISOBJ(entry)) + uobj = entry->object.uvm_obj; + else + uobj = NULL; /* * No amap cleaning necessary if: - * - * (1) There's no amap. - * - * (2) We're not deactivating or freeing pages. + * - there's no amap + * - we're not deactivating or freeing pages. */ - if (amap == NULL || (flags & (PGO_DEACTIVATE|PGO_FREE)) == 0) goto flush_object; - - /* XXX for now, just in case... */ - if (amap_clean_works == 0) + if (!amap_clean_works) goto flush_object; - offset = start - current->start; - size = MIN(end, current->end) - start; - for ( ; size != 0; size -= PAGE_SIZE, offset += PAGE_SIZE) { - anon = amap_lookup(¤t->aref, offset); + cp_start = MAX(entry->start, start); + cp_end = MIN(entry->end, end); + + for (; cp_start != cp_end; cp_start += PAGE_SIZE) { + anon = amap_lookup(&entry->aref, + cp_start - entry->start); if (anon == NULL) continue; - simple_lock(&anon->an_lock); + simple_lock(&anon->an_lock); /* XXX */ pg = anon->an_page; if (pg == NULL) { @@ -2986,23 +4140,21 @@ uvm_map_clean(struct vm_map *map, vaddr_t start, vaddr_t end, int flags) } switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) { - /* * XXX In these first 3 cases, we always just * XXX deactivate the page. We may want to * XXX handle the different cases more * XXX specifically, in the future. */ - case PGO_CLEANIT|PGO_FREE: case PGO_CLEANIT|PGO_DEACTIVATE: case PGO_DEACTIVATE: - deactivate_it: +deactivate_it: /* skip the page if it's loaned or wired */ if (pg->loan_count != 0 || pg->wire_count != 0) { simple_unlock(&anon->an_lock); - continue; + break; } uvm_lock_pageq(); @@ -3012,51 +4164,45 @@ uvm_map_clean(struct vm_map *map, vaddr_t start, vaddr_t end, int flags) * by the anon (may simply be loaned to the * anon). */ - if ((pg->pg_flags & PQ_ANON) == 0) { KASSERT(pg->uobject == NULL); uvm_unlock_pageq(); simple_unlock(&anon->an_lock); - continue; + break; } KASSERT(pg->uanon == anon); -#ifdef UBC - /* ...and deactivate the page. */ - pmap_clear_reference(pg); -#else /* zap all mappings for the page. */ pmap_page_protect(pg, VM_PROT_NONE); /* ...and deactivate the page. */ -#endif uvm_pagedeactivate(pg); uvm_unlock_pageq(); simple_unlock(&anon->an_lock); - continue; + break; case PGO_FREE: /* - * If there are multiple references to + * If there are mutliple references to * the amap, just deactivate the page. 
*/ - if (amap_refs(amap) > 1) goto deactivate_it; /* XXX skip the page if it's wired */ if (pg->wire_count != 0) { simple_unlock(&anon->an_lock); - continue; + break; } - amap_unadd(¤t->aref, offset); + amap_unadd(&entry->aref, + cp_start - entry->start); refs = --anon->an_ref; simple_unlock(&anon->an_lock); if (refs == 0) uvm_anfree(anon); - continue; + break; default: panic("uvm_map_clean: weird flags"); @@ -3064,817 +4210,819 @@ uvm_map_clean(struct vm_map *map, vaddr_t start, vaddr_t end, int flags) } flush_object: + cp_start = MAX(entry->start, start); + cp_end = MIN(entry->end, end); + /* * flush pages if we've got a valid backing object. * * Don't PGO_FREE if we don't have write permission - * and don't flush if this is a copy-on-write object + * and don't flush if this is a copy-on-write object * since we can't know our permissions on it. */ - - offset = current->offset + (start - current->start); - size = MIN(end, current->end) - start; if (uobj != NULL && ((flags & PGO_FREE) == 0 || ((entry->max_protection & VM_PROT_WRITE) != 0 && (entry->etype & UVM_ET_COPYONWRITE) == 0))) { simple_lock(&uobj->vmobjlock); - rv = uobj->pgops->pgo_flush(uobj, offset, - offset + size, flags); + rv = uobj->pgops->pgo_flush(uobj, + cp_start - entry->start + entry->offset, + cp_end - entry->start + entry->offset, flags); simple_unlock(&uobj->vmobjlock); if (rv == FALSE) error = EFAULT; } - start += size; - } - vm_map_unlock_read(map); - return (error); -} - - -/* - * uvm_map_checkprot: check protection in map - * - * => must allow specified protection in a fully allocated region. - * => map must be read or write locked by caller. - */ - -boolean_t -uvm_map_checkprot(struct vm_map *map, vaddr_t start, vaddr_t end, - vm_prot_t protection) -{ - struct vm_map_entry *entry; - struct vm_map_entry *tmp_entry; - - if (!uvm_map_lookup_entry(map, start, &tmp_entry)) { - return(FALSE); - } - entry = tmp_entry; - while (start < end) { - if (entry == &map->header) { - return(FALSE); - } - - /* - * no holes allowed - */ - - if (start < entry->start) { - return(FALSE); - } - - /* - * check protection associated with entry - */ - - if ((entry->protection & protection) != protection) { - return(FALSE); - } - - /* go to next entry */ - - start = entry->end; - entry = entry->next; } - return(TRUE); -} - -/* - * uvmspace_alloc: allocate a vmspace structure. - * - * - structure includes vm_map and pmap - * - XXX: no locking on this structure - * - refcnt set to 1, rest must be init'd by caller - */ -struct vmspace * -uvmspace_alloc(vaddr_t min, vaddr_t max, boolean_t pageable, - boolean_t remove_holes) -{ - struct vmspace *vm; - vm = pool_get(&uvm_vmspace_pool, PR_WAITOK | PR_ZERO); - uvmspace_init(vm, NULL, min, max, pageable, remove_holes); - return (vm); + vm_map_unlock_read(map); + return error; } /* - * uvmspace_init: initialize a vmspace structure. - * - * - XXX: no locking on this structure - * - refcnt set to 1, rest must be init'd by caller + * UVM_MAP_CLIP_END implementation */ void -uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t min, vaddr_t max, - boolean_t pageable, boolean_t remove_holes) +uvm_map_clip_end(struct vm_map *map, struct vm_map_entry *entry, vaddr_t addr) { + struct vm_map_entry *tmp; - uvm_map_setup(&vm->vm_map, min, max, pageable ? 
VM_MAP_PAGEABLE : 0); - - if (pmap) - pmap_reference(pmap); - else - pmap = pmap_create(); - vm->vm_map.pmap = pmap; - - vm->vm_refcnt = 1; + KASSERT(entry->start < addr && VMMAP_FREE_END(entry) > addr); + tmp = uvm_mapent_alloc(map, 0); - if (remove_holes) - pmap_remove_holes(&vm->vm_map); + /* + * Invoke splitentry. + */ + uvm_map_splitentry(map, entry, tmp, addr); } /* - * uvmspace_share: share a vmspace between two processes + * UVM_MAP_CLIP_START implementation * - * - XXX: no locking on vmspace - * - used for vfork and threads + * Clippers are required to not change the pointers to the entry they are + * clipping on. + * Since uvm_map_splitentry turns the original entry into the lowest + * entry (address wise) we do a swap between the new entry and the original + * entry, prior to calling uvm_map_splitentry. */ - void -uvmspace_share(struct proc *p1, struct proc *p2) +uvm_map_clip_start(struct vm_map *map, struct vm_map_entry *entry, vaddr_t addr) { - p2->p_vmspace = p1->p_vmspace; - p1->p_vmspace->vm_refcnt++; + struct vm_map_entry *tmp; + struct uvm_addr_state *free; + + /* Unlink original. */ + free = uvm_map_uaddr_e(map, entry); + if (free) + uvm_mapent_free_remove(map, free, entry); + uvm_mapent_addr_remove(map, entry); + + /* Copy entry. */ + KASSERT(entry->start < addr && VMMAP_FREE_END(entry) > addr); + tmp = uvm_mapent_alloc(map, 0); + uvm_mapent_copy(entry, tmp); + + /* Put new entry in place of original entry. */ + uvm_mapent_addr_insert(map, tmp); + if (free) + uvm_mapent_free_insert(map, free, tmp); + + /* Invoke splitentry. */ + uvm_map_splitentry(map, tmp, entry, addr); } /* - * uvmspace_exec: the process wants to exec a new program - * - * - XXX: no locking on vmspace + * Boundary fixer. */ +static __inline vaddr_t uvm_map_boundfix(vaddr_t, vaddr_t, vaddr_t); +static __inline vaddr_t +uvm_map_boundfix(vaddr_t min, vaddr_t max, vaddr_t bound) +{ + return (min < bound && max > bound) ? bound : max; +} -void -uvmspace_exec(struct proc *p, vaddr_t start, vaddr_t end) +/* + * Choose free list based on address at start of free space. + * + * The uvm_addr_state returned contains addr and is the first of: + * - uaddr_exe + * - uaddr_brk_stack + * - uaddr_any + */ +struct uvm_addr_state* +uvm_map_uaddr(struct vm_map *map, vaddr_t addr) { - struct vmspace *nvm, *ovm = p->p_vmspace; - struct vm_map *map = &ovm->vm_map; + struct uvm_addr_state *uaddr; + int i; - pmap_unuse_final(p); /* before stack addresses go away */ + /* Special case the first page, to prevent mmap from returning 0. */ + if (addr < VMMAP_MIN_ADDR) + return NULL; + + /* Upper bound for kernel maps at uvm_maxkaddr. */ + if ((map->flags & VM_MAP_ISVMSPACE) == 0) { + if (addr >= uvm_maxkaddr) + return NULL; + } + + /* Is the address inside the exe-only map? */ + if (map->uaddr_exe != NULL && addr >= map->uaddr_exe->uaddr_minaddr && + addr < map->uaddr_exe->uaddr_maxaddr) + return map->uaddr_exe; + + /* Check if the space falls inside brk/stack area. */ + if ((addr >= map->b_start && addr < map->b_end) || + (addr >= map->s_start && addr < map->s_end)) { + if (map->uaddr_brk_stack != NULL && + addr >= map->uaddr_brk_stack->uaddr_minaddr && + addr < map->uaddr_brk_stack->uaddr_maxaddr) { + return map->uaddr_brk_stack; + } else + return NULL; + } /* - * see if more than one process is using this vmspace... + * Check the other selectors. + * + * These selectors are only marked as the owner, if they have insert + * functions. 
*/ + for (i = 0; i < nitems(map->uaddr_any); i++) { + uaddr = map->uaddr_any[i]; + if (uaddr == NULL) + continue; + if (uaddr->uaddr_functions->uaddr_free_insert == NULL) + continue; - if (ovm->vm_refcnt == 1) { + if (addr >= uaddr->uaddr_minaddr && + addr < uaddr->uaddr_maxaddr) + return uaddr; + } - /* - * if p is the only process using its vmspace then we can safely - * recycle that vmspace for the program that is being exec'd. - */ + return NULL; +} -#ifdef SYSVSHM - /* - * SYSV SHM semantics require us to kill all segments on an exec - */ - if (ovm->vm_shm) - shmexit(ovm); -#endif +/* + * Choose free list based on address at start of free space. + * + * The uvm_addr_state returned contains addr and is the first of: + * - uaddr_exe + * - uaddr_brk_stack + * - uaddr_any + */ +struct uvm_addr_state* +uvm_map_uaddr_e(struct vm_map *map, struct vm_map_entry *entry) +{ + return uvm_map_uaddr(map, VMMAP_FREE_START(entry)); +} - /* - * POSIX 1003.1b -- "lock future mappings" is revoked - * when a process execs another program image. - */ - vm_map_lock(map); - vm_map_modflags(map, 0, VM_MAP_WIREFUTURE); - vm_map_unlock(map); +/* + * Returns the first free-memory boundary that is crossed by [min-max]. + */ +vsize_t +uvm_map_boundary(struct vm_map *map, vaddr_t min, vaddr_t max) +{ + struct uvm_addr_state *uaddr; + int i; - /* - * now unmap the old program - */ - uvm_unmap(map, map->min_offset, map->max_offset); + /* Never return first page. */ + max = uvm_map_boundfix(min, max, VMMAP_MIN_ADDR); - /* - * but keep MMU holes unavailable - */ - pmap_remove_holes(map); + /* Treat the maxkaddr special, if the map is a kernel_map. */ + if ((map->flags & VM_MAP_ISVMSPACE) == 0) + max = uvm_map_boundfix(min, max, uvm_maxkaddr); - /* - * resize the map - */ - vm_map_lock(map); - map->min_offset = start; - uvm_tree_sanity(map, "resize enter"); - map->max_offset = end; - if (map->header.prev != &map->header) - uvm_rb_fixup(map, map->header.prev); - uvm_tree_sanity(map, "resize leave"); - vm_map_unlock(map); - + /* Check for exe-only boundaries. */ + if (map->uaddr_exe != NULL) { + max = uvm_map_boundfix(min, max, map->uaddr_exe->uaddr_minaddr); + max = uvm_map_boundfix(min, max, map->uaddr_exe->uaddr_maxaddr); + } - } else { + /* Check for exe-only boundaries. */ + if (map->uaddr_brk_stack != NULL) { + max = uvm_map_boundfix(min, max, + map->uaddr_brk_stack->uaddr_minaddr); + max = uvm_map_boundfix(min, max, + map->uaddr_brk_stack->uaddr_maxaddr); + } - /* - * p's vmspace is being shared, so we can't reuse it for p since - * it is still being used for others. allocate a new vmspace - * for p - */ - nvm = uvmspace_alloc(start, end, - (map->flags & VM_MAP_PAGEABLE) ? TRUE : FALSE, TRUE); + /* Check other boundaries. */ + for (i = 0; i < nitems(map->uaddr_any); i++) { + uaddr = map->uaddr_any[i]; + if (uaddr != NULL) { + max = uvm_map_boundfix(min, max, uaddr->uaddr_minaddr); + max = uvm_map_boundfix(min, max, uaddr->uaddr_maxaddr); + } + } - /* - * install new vmspace and drop our ref to the old one. - */ + /* Boundaries at stack and brk() area. */ + max = uvm_map_boundfix(min, max, map->s_start); + max = uvm_map_boundfix(min, max, map->s_end); + max = uvm_map_boundfix(min, max, map->b_start); + max = uvm_map_boundfix(min, max, map->b_end); - pmap_deactivate(p); - p->p_vmspace = nvm; - pmap_activate(p); + return max; +} - uvmspace_free(ovm); +/* + * Update map allocation start and end addresses from proc vmspace. 
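+ *
+ * The brk() arena is derived from vm_daddr and spans BRKSIZ bytes; the
+ * stack arena spans the range between vm_minsaddr and vm_maxsaddr.
+ * The free lists are only rebuilt when these boundaries actually
+ * change.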
+ */ +void +uvm_map_vmspace_update(struct vm_map *map, + struct uvm_map_deadq *dead, int flags) +{ + struct vmspace *vm; + vaddr_t b_start, b_end, s_start, s_end; + + KASSERT(map->flags & VM_MAP_ISVMSPACE); + KASSERT(offsetof(struct vmspace, vm_map) == 0); + + /* + * Derive actual allocation boundaries from vmspace. + */ + vm = (struct vmspace *)map; + b_start = (vaddr_t)vm->vm_daddr; + b_end = b_start + BRKSIZ; + s_start = MIN((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr); + s_end = MAX((vaddr_t)vm->vm_maxsaddr, (vaddr_t)vm->vm_minsaddr); +#ifdef DIAGNOSTIC + if ((b_start & (vaddr_t)PAGE_MASK) != 0 || + (b_end & (vaddr_t)PAGE_MASK) != 0 || + (s_start & (vaddr_t)PAGE_MASK) != 0 || + (s_end & (vaddr_t)PAGE_MASK) != 0) { + panic("uvm_map_vmspace_update: vmspace %p invalid bounds: " + "b=0x%lx-0x%lx s=0x%lx-0x%lx", + vm, b_start, b_end, s_start, s_end); } +#endif + + if (__predict_true(map->b_start == b_start && map->b_end == b_end && + map->s_start == s_start && map->s_end == s_end)) + return; + + uvm_map_freelist_update(map, dead, b_start, b_end, + s_start, s_end, flags); } /* - * uvmspace_free: free a vmspace data structure + * Grow kernel memory. * - * - XXX: no locking on vmspace + * This function is only called for kernel maps when an allocation fails. + * + * If the map has a gap that is large enough to accomodate alloc_sz, this + * function will make sure map->free will include it. */ - void -uvmspace_free(struct vmspace *vm) +uvm_map_kmem_grow(struct vm_map *map, struct uvm_map_deadq *dead, + vsize_t alloc_sz, int flags) { - struct vm_map_entry *dead_entries; + vsize_t sz; + vaddr_t end; + struct vm_map_entry *entry; - if (--vm->vm_refcnt == 0) { - /* - * lock the map, to wait out all other references to it. delete - * all of the mappings and pages they hold, then call the pmap - * module to reclaim anything left. - */ -#ifdef SYSVSHM - /* Get rid of any SYSV shared memory segments. */ - if (vm->vm_shm != NULL) - shmexit(vm); + /* Kernel memory only. */ + KASSERT((map->flags & VM_MAP_ISVMSPACE) == 0); + /* Destroy free list. */ + uvm_map_freelist_update_clear(map, dead); + + /* + * Include the guard page in the hard minimum requirement of alloc_sz. + */ + if (map->flags & VM_MAP_GUARDPAGES) + alloc_sz += PAGE_SIZE; + + /* + * Grow by ALLOCMUL * alloc_sz, but at least VM_MAP_KSIZE_DELTA. + * + * Don't handle the case where the multiplication overflows: + * if that happens, the allocation is probably too big anyway. + */ + sz = MAX(VM_MAP_KSIZE_ALLOCMUL * alloc_sz, VM_MAP_KSIZE_DELTA); + + /* + * Walk forward until a gap large enough for alloc_sz shows up. + * + * We assume the kernel map has no boundaries. + * uvm_maxkaddr may be zero. + */ + end = MAX(uvm_maxkaddr, map->min_offset); + entry = uvm_map_entrybyaddr(&map->addr, end); + while (entry && entry->fspace < alloc_sz) + entry = RB_NEXT(uvm_map_addr, &map->addr, entry); + if (entry) { + end = MAX(VMMAP_FREE_START(entry), end); + end += MIN(sz, map->max_offset - end); + } else + end = map->max_offset; + + /* Reserve pmap entries. 
*/ +#ifdef PMAP_GROWKERNEL + uvm_maxkaddr = pmap_growkernel(end); +#else + uvm_maxkaddr = end; #endif - vm_map_lock(&vm->vm_map); - if (vm->vm_map.nentries) { - uvm_unmap_remove(&vm->vm_map, - vm->vm_map.min_offset, vm->vm_map.max_offset, - &dead_entries, NULL, TRUE); - if (dead_entries != NULL) - uvm_unmap_detach(dead_entries, 0); - } - pmap_destroy(vm->vm_map.pmap); - vm->vm_map.pmap = NULL; - pool_put(&uvm_vmspace_pool, vm); - } + + printf("uvm_km_kmem_grow: grown to 0x%lx\n", uvm_maxkaddr); + + /* Rebuild free list. */ + uvm_map_freelist_update_refill(map, flags); } /* - * uvm_map_create: create map + * Freelist update subfunction: unlink all entries from freelists. */ -vm_map_t -uvm_map_create(pmap_t pmap, vaddr_t min, vaddr_t max, int flags) +void +uvm_map_freelist_update_clear(struct vm_map *map, struct uvm_map_deadq *dead) { - vm_map_t result; - - result = malloc(sizeof(struct vm_map), M_VMMAP, M_WAITOK); - uvm_map_setup(result, min, max, flags); - result->pmap = pmap; - return(result); + struct uvm_addr_state *free; + struct vm_map_entry *entry, *prev, *next; + + prev = NULL; + for (entry = RB_MIN(uvm_map_addr, &map->addr); entry != NULL; + entry = next) { + next = RB_NEXT(uvm_map_addr, &map->addr, entry); + + free = uvm_map_uaddr_e(map, entry); + if (free) + uvm_mapent_free_remove(map, free, entry); + + if (prev != NULL && entry->start == entry->end) { + prev->fspace += VMMAP_FREE_END(entry) - entry->end; + uvm_mapent_addr_remove(map, entry); + DEAD_ENTRY_PUSH(dead, entry); + } else + prev = entry; + } } /* - * uvm_map_setup: init map - * - * => map must not be in service yet. + * Freelist update subfunction: refill the freelists with entries. */ void -uvm_map_setup(vm_map_t map, vaddr_t min, vaddr_t max, int flags) +uvm_map_freelist_update_refill(struct vm_map *map, int flags) { + struct vm_map_entry *entry; + vaddr_t min, max; - RB_INIT(&map->rbhead); - map->header.next = map->header.prev = &map->header; - map->nentries = 0; - map->size = 0; - map->ref_count = 1; - map->min_offset = min; - map->max_offset = max; - map->flags = flags; - map->first_free = &map->header; - map->hint = &map->header; - map->timestamp = 0; - rw_init(&map->lock, "vmmaplk"); - simple_lock_init(&map->ref_lock); - simple_lock_init(&map->hint_lock); -} + RB_FOREACH(entry, uvm_map_addr, &map->addr) { + min = VMMAP_FREE_START(entry); + max = VMMAP_FREE_END(entry); + entry->fspace = 0; + entry = uvm_map_fix_space(map, entry, min, max, flags); + } + uvm_tree_sanity(map, __FILE__, __LINE__); +} /* - * uvm_map_reference: add reference to a map - * - * => map need not be locked (we use ref_lock). + * Change {a,b}_{start,end} allocation ranges and associated free lists. */ void -uvm_map_reference(vm_map_t map) +uvm_map_freelist_update(struct vm_map *map, struct uvm_map_deadq *dead, + vaddr_t b_start, vaddr_t b_end, vaddr_t s_start, vaddr_t s_end, int flags) { - simple_lock(&map->ref_lock); - map->ref_count++; - simple_unlock(&map->ref_lock); + KDASSERT(b_end >= b_start && s_end >= s_start); + + /* Clear all free lists. */ + uvm_map_freelist_update_clear(map, dead); + + /* Apply new bounds. */ + map->b_start = b_start; + map->b_end = b_end; + map->s_start = s_start; + map->s_end = s_end; + + /* Refill free lists. */ + uvm_map_freelist_update_refill(map, flags); } /* - * uvm_map_deallocate: drop reference to a map + * Assign a uvm_addr_state to the specified pointer in vm_map. * - * => caller must not lock map - * => we will zap map if ref count goes to zero + * May sleep. 
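+ *
+ * The free lists are emptied while the map is write locked, the old
+ * selector is destroyed and replaced by the new one, and the lists are
+ * refilled; entries freed along the way are only released after the
+ * map lock has been dropped.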
*/ void -uvm_map_deallocate(vm_map_t map) +uvm_map_set_uaddr(struct vm_map *map, struct uvm_addr_state **which, + struct uvm_addr_state *newval) { - int c; + struct uvm_map_deadq dead; - simple_lock(&map->ref_lock); - c = --map->ref_count; - simple_unlock(&map->ref_lock); - if (c > 0) { - return; - } + /* Pointer which must be in this map. */ + KASSERT(which != NULL); + KASSERT((void*)map <= (void*)(which) && + (void*)(which) < (void*)(map + 1)); - /* - * all references gone. unmap and free. - */ + vm_map_lock(map); + TAILQ_INIT(&dead); + uvm_map_freelist_update_clear(map, &dead); - uvm_unmap(map, map->min_offset, map->max_offset); - pmap_destroy(map->pmap); - free(map, M_VMMAP); + uvm_addr_destroy(*which); + *which = newval; + + uvm_map_freelist_update_refill(map, 0); + vm_map_unlock(map); + uvm_unmap_detach(&dead, 0); } /* - * F O R K - m a i n e n t r y p o i n t - */ -/* - * uvmspace_fork: fork a process' main map + * Correct space insert. * - * => create a new vmspace for child process from parent. - * => parent's map must not be locked. + * Entry must not be on any freelist. */ - -struct vmspace * -uvmspace_fork(struct vmspace *vm1) +struct vm_map_entry* +uvm_map_fix_space(struct vm_map *map, struct vm_map_entry *entry, + vaddr_t min, vaddr_t max, int flags) { - struct vmspace *vm2; - struct vm_map *old_map = &vm1->vm_map; - struct vm_map *new_map; - struct vm_map_entry *old_entry; - struct vm_map_entry *new_entry; - pmap_t new_pmap; - boolean_t protect_child; - - vm_map_lock(old_map); - - vm2 = uvmspace_alloc(old_map->min_offset, old_map->max_offset, - (old_map->flags & VM_MAP_PAGEABLE) ? TRUE : FALSE, FALSE); - memcpy(&vm2->vm_startcopy, &vm1->vm_startcopy, - (caddr_t) (vm1 + 1) - (caddr_t) &vm1->vm_startcopy); - new_map = &vm2->vm_map; /* XXX */ - new_pmap = new_map->pmap; + struct uvm_addr_state *free, *entfree; + vaddr_t lmax; - old_entry = old_map->header.next; + KASSERT(entry == NULL || (entry->etype & UVM_ET_FREEMAPPED) == 0); + KDASSERT(min <= max); + KDASSERT((entry != NULL && VMMAP_FREE_END(entry) == min) || + min == map->min_offset); /* - * go entry-by-entry + * During the function, entfree will always point at the uaddr state + * for entry. */ + entfree = (entry == NULL ? NULL : + uvm_map_uaddr_e(map, entry)); - while (old_entry != &old_map->header) { - + while (min != max) { /* - * first, some sanity checks on the old entry + * Claim guard page for entry. */ - if (UVM_ET_ISSUBMAP(old_entry)) - panic("fork: encountered a submap during fork (illegal)"); - - if (!UVM_ET_ISCOPYONWRITE(old_entry) && - UVM_ET_ISNEEDSCOPY(old_entry)) - panic("fork: non-copy_on_write map entry marked needs_copy (illegal)"); - - - switch (old_entry->inheritance) { - case MAP_INHERIT_NONE: - /* - * drop the mapping - */ - break; - - case MAP_INHERIT_SHARE: - /* - * share the mapping: this means we want the old and - * new entries to share amaps and backing objects. - */ - - /* - * if the old_entry needs a new amap (due to prev fork) - * then we need to allocate it now so that we have - * something we own to share with the new_entry. [in - * other words, we need to clear needs_copy] - */ - - if (UVM_ET_ISNEEDSCOPY(old_entry)) { - /* get our own amap, clears needs_copy */ - amap_copy(old_map, old_entry, M_WAITOK, FALSE, - 0, 0); - /* XXXCDC: WAITOK??? 
*/ + if ((map->flags & VM_MAP_GUARDPAGES) && entry != NULL && + VMMAP_FREE_END(entry) == entry->end && + entry->start != entry->end) { + if (max - min == 2 * PAGE_SIZE) { + /* + * If the free-space gap is exactly 2 pages, + * we make the guard 2 pages instead of 1. + * Because in a guarded map, an area needs + * at least 2 pages to allocate from: + * one page for the allocation and one for + * the guard. + */ + entry->guard = 2 * PAGE_SIZE; + min = max; + } else { + entry->guard = PAGE_SIZE; + min += PAGE_SIZE; } + continue; + } - new_entry = uvm_mapent_alloc(new_map, 0); - /* old_entry -> new_entry */ - uvm_mapent_copy(old_entry, new_entry); - - /* new pmap has nothing wired in it */ - new_entry->wired_count = 0; - - /* - * gain reference to object backing the map (can't - * be a submap, already checked this case). - */ - if (new_entry->aref.ar_amap) - /* share reference */ - uvm_map_reference_amap(new_entry, AMAP_SHARED); - - if (new_entry->object.uvm_obj && - new_entry->object.uvm_obj->pgops->pgo_reference) - new_entry->object.uvm_obj-> - pgops->pgo_reference( - new_entry->object.uvm_obj); - - /* insert entry at end of new_map's entry list */ - uvm_map_entry_link(new_map, new_map->header.prev, - new_entry); - - /* - * pmap_copy the mappings: this routine is optional - * but if it is there it will reduce the number of - * page faults in the new proc. - */ - - pmap_copy(new_pmap, old_map->pmap, new_entry->start, - (old_entry->end - old_entry->start), - old_entry->start); - - break; + /* + * Handle the case where entry has a 2-page guard, but the + * space after entry is freed. + */ + if (entry != NULL && entry->fspace == 0 && + entry->guard > PAGE_SIZE) { + entry->guard = PAGE_SIZE; + min = VMMAP_FREE_START(entry); + } - case MAP_INHERIT_COPY: + lmax = uvm_map_boundary(map, min, max); + free = uvm_map_uaddr(map, min); + /* + * Entries are merged if they point at the same uvm_free(). + * Exception to that rule: if min == uvm_maxkaddr, a new + * entry is started regardless (otherwise the allocators + * will get confused). + */ + if (entry != NULL && free == entfree && + !((map->flags & VM_MAP_ISVMSPACE) == 0 && + min == uvm_maxkaddr)) { + KDASSERT(VMMAP_FREE_END(entry) == min); + entry->fspace += lmax - min; + } else { /* - * copy-on-write the mapping (using mmap's - * MAP_PRIVATE semantics) - * - * allocate new_entry, adjust reference counts. - * (note that new references are read-only). + * Commit entry to free list: it'll not be added to + * anymore. + * We'll start a new entry and add to that entry + * instead. */ + if (entry != NULL && entfree != NULL) + uvm_mapent_free_insert(map, entfree, entry); + + /* New entry for new uaddr. 
*/ + entry = uvm_mapent_alloc(map, flags); + KDASSERT(entry != NULL); + entry->end = entry->start = min; + entry->guard = 0; + entry->fspace = lmax - min; + entry->object.uvm_obj = NULL; + entry->offset = 0; + entry->etype = 0; + entry->protection = entry->max_protection = 0; + entry->inheritance = 0; + entry->wired_count = 0; + entry->advice = 0; + entry->aref.ar_pageoff = 0; + entry->aref.ar_amap = NULL; + uvm_mapent_addr_insert(map, entry); + + entfree = free; + } - new_entry = uvm_mapent_alloc(new_map, 0); - /* old_entry -> new_entry */ - uvm_mapent_copy(old_entry, new_entry); - - if (new_entry->aref.ar_amap) - uvm_map_reference_amap(new_entry, 0); - - if (new_entry->object.uvm_obj && - new_entry->object.uvm_obj->pgops->pgo_reference) - new_entry->object.uvm_obj->pgops->pgo_reference - (new_entry->object.uvm_obj); + min = lmax; + } + /* Finally put entry on the uaddr state. */ + if (entry != NULL && entfree != NULL) + uvm_mapent_free_insert(map, entfree, entry); - /* new pmap has nothing wired in it */ - new_entry->wired_count = 0; + return entry; +} - new_entry->etype |= - (UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY); - uvm_map_entry_link(new_map, new_map->header.prev, - new_entry); +/* + * MQuery style of allocation. + * + * This allocator searches forward until sufficient space is found to map + * the given size. + * + * XXX: factor in offset (via pmap_prefer) and protection? + */ +int +uvm_map_mquery(struct vm_map *map, vaddr_t *addr_p, vsize_t sz, voff_t offset, + int flags) +{ + struct vm_map_entry *entry, *last; + vaddr_t addr; + vaddr_t tmp, pmap_align, pmap_offset; + int error; - /* - * the new entry will need an amap. it will either - * need to be copied from the old entry or created - * from scratch (if the old entry does not have an - * amap). can we defer this process until later - * (by setting "needs_copy") or do we need to copy - * the amap now? - * - * we must copy the amap now if any of the following - * conditions hold: - * 1. the old entry has an amap and that amap is - * being shared. this means that the old (parent) - * process is sharing the amap with another - * process. if we do not clear needs_copy here - * we will end up in a situation where both the - * parent and child process are referring to the - * same amap with "needs_copy" set. if the - * parent write-faults, the fault routine will - * clear "needs_copy" in the parent by allocating - * a new amap. this is wrong because the - * parent is supposed to be sharing the old amap - * and the new amap will break that. - * - * 2. if the old entry has an amap and a non-zero - * wire count then we are going to have to call - * amap_cow_now to avoid page faults in the - * parent process. since amap_cow_now requires - * "needs_copy" to be clear we might as well - * clear it here as well. - * - */ + addr = *addr_p; + vm_map_lock_read(map); - if (old_entry->aref.ar_amap != NULL) { + /* + * Configure pmap prefer. + */ + if (offset != UVM_UNKNOWN_OFFSET) { + pmap_align = MAX(PAGE_SIZE, PMAP_PREFER_ALIGN()); + pmap_offset = PMAP_PREFER_OFFSET(offset); + } else { + pmap_align = PAGE_SIZE; + pmap_offset = 0; + } - if ((amap_flags(old_entry->aref.ar_amap) & - AMAP_SHARED) != 0 || - VM_MAPENT_ISWIRED(old_entry)) { + /* + * Align address to pmap_prefer unless FLAG_FIXED is set. 
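+ *
+ * Worked example with assumed (machine dependent) values: for
+ * pmap_align = 0x10000, pmap_offset = 0x3000 and addr = 0x24800 the
+ * mask step yields 0x23000; that lies below addr, so pmap_align is
+ * added once and the search starts at 0x33000, the lowest address at
+ * or above addr with the preferred colouring.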
+ */ + if (!(flags & UVM_FLAG_FIXED) && offset != UVM_UNKNOWN_OFFSET) { + tmp = (addr & ~(pmap_align - 1)) | pmap_offset; + if (tmp < addr) + tmp += pmap_align; + addr = tmp; + } - amap_copy(new_map, new_entry, M_WAITOK, FALSE, - 0, 0); - /* XXXCDC: M_WAITOK ... ok? */ - } - } + /* + * First, check if the requested range is fully available. + */ + entry = uvm_map_entrybyaddr(&map->addr, addr); + last = NULL; + if (uvm_map_isavail(map, NULL, &entry, &last, addr, sz)) { + error = 0; + goto out; + } + if (flags & UVM_FLAG_FIXED) { + error = EINVAL; + goto out; + } - /* - * if the parent's entry is wired down, then the - * parent process does not want page faults on - * access to that memory. this means that we - * cannot do copy-on-write because we can't write - * protect the old entry. in this case we - * resolve all copy-on-write faults now, using - * amap_cow_now. note that we have already - * allocated any needed amap (above). - */ + error = ENOMEM; /* Default error from here. */ - if (VM_MAPENT_ISWIRED(old_entry)) { - - /* - * resolve all copy-on-write faults now - * (note that there is nothing to do if - * the old mapping does not have an amap). - * XXX: is it worthwhile to bother with pmap_copy - * in this case? - */ - if (old_entry->aref.ar_amap) - amap_cow_now(new_map, new_entry); - - } else { - - /* - * setup mappings to trigger copy-on-write faults - * we must write-protect the parent if it has - * an amap and it is not already "needs_copy"... - * if it is already "needs_copy" then the parent - * has already been write-protected by a previous - * fork operation. - * - * if we do not write-protect the parent, then - * we must be sure to write-protect the child - * after the pmap_copy() operation. - * - * XXX: pmap_copy should have some way of telling - * us that it didn't do anything so we can avoid - * calling pmap_protect needlessly. - */ - - if (old_entry->aref.ar_amap) { - - if (!UVM_ET_ISNEEDSCOPY(old_entry)) { - if (old_entry->max_protection & VM_PROT_WRITE) { - pmap_protect(old_map->pmap, - old_entry->start, - old_entry->end, - old_entry->protection & - ~VM_PROT_WRITE); - pmap_update(old_map->pmap); - - } - old_entry->etype |= UVM_ET_NEEDSCOPY; - } - - /* - * parent must now be write-protected - */ - protect_child = FALSE; - } else { - - /* - * we only need to protect the child if the - * parent has write access. - */ - if (old_entry->max_protection & VM_PROT_WRITE) - protect_child = TRUE; - else - protect_child = FALSE; - - } - - /* - * copy the mappings - * XXX: need a way to tell if this does anything - */ - - pmap_copy(new_pmap, old_map->pmap, - new_entry->start, - (old_entry->end - old_entry->start), - old_entry->start); - - /* - * protect the child's mappings if necessary - */ - if (protect_child) { - pmap_protect(new_pmap, new_entry->start, - new_entry->end, - new_entry->protection & - ~VM_PROT_WRITE); - } + /* + * At this point, the memory at is not available. + * The reasons are: + * [1] it's outside the map, + * [2] it starts in used memory (and therefore needs to move + * toward the first free page in entry), + * [3] it starts in free memory but bumps into used memory. + * + * Note that for case [2], the forward moving is handled by the + * for loop below. + */ - } - break; - } /* end of switch statement */ - old_entry = old_entry->next; + if (entry == NULL) { + /* [1] Outside the map. */ + if (addr >= map->max_offset) + goto out; + else + entry = RB_MIN(uvm_map_addr, &map->addr); + } else if (VMMAP_FREE_START(entry) <= addr) { + /* [3] Bumped into used memory. 
*/ + entry = RB_NEXT(uvm_map_addr, &map->addr, entry); } - new_map->size = old_map->size; - vm_map_unlock(old_map); + /* + * Test if the next entry is sufficient for the allocation. + */ + for (; entry != NULL; + entry = RB_NEXT(uvm_map_addr, &map->addr, entry)) { + if (entry->fspace == 0) + continue; + addr = VMMAP_FREE_START(entry); -#ifdef SYSVSHM - if (vm1->vm_shm) - shmfork(vm1, vm2); -#endif +restart: /* Restart address checks on address change. */ -#ifdef PMAP_FORK - pmap_fork(vm1->vm_map.pmap, vm2->vm_map.pmap); -#endif + tmp = (addr & ~(pmap_align - 1)) | pmap_offset; + if (tmp < addr) + tmp += pmap_align; + addr = tmp; + if (addr >= VMMAP_FREE_END(entry)) + continue; - return(vm2); -} + /* + * Skip brk() allocation addresses. + */ + if (addr + sz > map->b_start && addr < map->b_end) { + if (VMMAP_FREE_END(entry) > map->b_end) { + addr = map->b_end; + goto restart; + } else + continue; + } + /* + * Skip stack allocation addresses. + */ + if (addr + sz > map->s_start && addr < map->s_end) { + if (VMMAP_FREE_END(entry) > map->s_end) { + addr = map->s_end; + goto restart; + } else + continue; + } -#if defined(DDB) + last = NULL; + if (uvm_map_isavail(map, NULL, &entry, &last, addr, sz)) { + error = 0; + goto out; + } + } -/* - * DDB hooks - */ +out: + vm_map_unlock_read(map); + if (error == 0) + *addr_p = addr; + return error; +} /* - * uvm_map_printit: actually prints the map + * Determine allocation bias. + * + * Returns 1 if we should bias to high addresses, -1 for a bias towards low + * addresses, or 0 for no bias. + * The bias mechanism is intended to avoid clashing with brk() and stack + * areas. */ - -void -uvm_map_printit(struct vm_map *map, boolean_t full, - int (*pr)(const char *, ...)) +int +uvm_mapent_bias(struct vm_map *map, struct vm_map_entry *entry) { - struct vm_map_entry *entry; + vaddr_t start, end; - (*pr)("MAP %p: [0x%lx->0x%lx]\n", map, map->min_offset,map->max_offset); - (*pr)("\t#ent=%d, sz=%u, ref=%d, version=%u, flags=0x%x\n", - map->nentries, map->size, map->ref_count, map->timestamp, - map->flags); -#ifdef pmap_resident_count - (*pr)("\tpmap=%p(resident=%d)\n", map->pmap, - pmap_resident_count(map->pmap)); + start = VMMAP_FREE_START(entry); + end = VMMAP_FREE_END(entry); + + /* + * Stay at the top of brk() area. + */ + if (end >= map->b_start && start < map->b_end) + return 1; + /* + * Stay at the far end of the stack area. + */ + if (end >= map->s_start && start < map->s_end) { +#ifdef MACHINE_STACK_GROWS_UP + return 1; #else - /* XXXCDC: this should be required ... */ - (*pr)("\tpmap=%p(resident=<>)\n", map->pmap); + return -1; #endif - if (!full) - return; - for (entry = map->header.next; entry != &map->header; - entry = entry->next) { - (*pr)(" - %p: 0x%lx->0x%lx: obj=%p/0x%llx, amap=%p/%d\n", - entry, entry->start, entry->end, entry->object.uvm_obj, - (long long)entry->offset, entry->aref.ar_amap, - entry->aref.ar_pageoff); - (*pr)( - "\tsubmap=%c, cow=%c, nc=%c, prot(max)=%d/%d, inh=%d, " - "wc=%d, adv=%d\n", - (entry->etype & UVM_ET_SUBMAP) ? 'T' : 'F', - (entry->etype & UVM_ET_COPYONWRITE) ? 'T' : 'F', - (entry->etype & UVM_ET_NEEDSCOPY) ? 'T' : 'F', - entry->protection, entry->max_protection, - entry->inheritance, entry->wired_count, entry->advice); } -} -/* - * uvm_object_printit: actually prints the object - */ + /* + * No bias, this area is meant for us. 
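+ *
+ * Summarizing the cases above: a free range overlapping the brk()
+ * arena is biased toward its high end, leaving the low end free for
+ * heap growth, and a range overlapping the stack arena is biased
+ * toward the far end of that arena, away from the established stack;
+ * anything else is handed out without preference.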
+ */ + return 0; +} -void -uvm_object_printit(struct uvm_object *uobj, boolean_t full, - int (*pr)(const char *, ...)) -{ - struct vm_page *pg; - int cnt = 0; - (*pr)("OBJECT %p: pgops=%p, npages=%d, ", - uobj, uobj->pgops, uobj->uo_npages); - if (UVM_OBJ_IS_KERN_OBJECT(uobj)) - (*pr)("refs=\n"); - else - (*pr)("refs=%d\n", uobj->uo_refs); +boolean_t +vm_map_lock_try_ln(struct vm_map *map, char *file, int line) +{ + boolean_t rv; - if (!full) { - return; - } - (*pr)(" PAGES :\n "); - RB_FOREACH(pg, uvm_objtree, &uobj->memt) { - (*pr)("<%p,0x%llx> ", pg, (long long)pg->offset); - if ((cnt % 3) == 2) { - (*pr)("\n "); + if (map->flags & VM_MAP_INTRSAFE) { + rv = TRUE; + } else { + if (map->flags & VM_MAP_BUSY) { + return (FALSE); } - cnt++; + rv = (rw_enter(&map->lock, RW_WRITE|RW_NOSLEEP) == 0); } - if ((cnt % 3) != 2) { - (*pr)("\n"); + + if (rv) { + map->timestamp++; + LPRINTF(("map lock: %p (at %s %d)\n", map, file, line)); + uvm_tree_sanity(map, file, line); + uvm_tree_size_chk(map, file, line); } -} -/* - * uvm_page_printit: actually print the page - */ + return (rv); +} -static const char page_flagbits[] = - "\20\1BUSY\2WANTED\3TABLED\4CLEAN\5CLEANCHK\6RELEASED\7FAKE\10RDONLY" - "\11ZERO\15PAGER1\20FREE\21INACTIVE\22ACTIVE\24ENCRYPT\30PMAP0" - "\31PMAP1\32PMAP2\33PMAP3"; +void +vm_map_lock_ln(struct vm_map *map, char *file, int line) +{ + if ((map->flags & VM_MAP_INTRSAFE) == 0) { + do { + while (map->flags & VM_MAP_BUSY) { + map->flags |= VM_MAP_WANTLOCK; + tsleep(&map->flags, PVM, (char *)vmmapbsy, 0); + } + } while (rw_enter(&map->lock, RW_WRITE|RW_SLEEPFAIL) != 0); + } + + map->timestamp++; + LPRINTF(("map lock: %p (at %s %d)\n", map, file, line)); + uvm_tree_sanity(map, file, line); + uvm_tree_size_chk(map, file, line); +} void -uvm_page_printit(struct vm_page *pg, boolean_t full, - int (*pr)(const char *, ...)) +vm_map_lock_read_ln(struct vm_map *map, char *file, int line) { - struct vm_page *tpg; - struct uvm_object *uobj; - struct pglist *pgl; + if ((map->flags & VM_MAP_INTRSAFE) == 0) + rw_enter_read(&map->lock); + LPRINTF(("map lock: %p (at %s %d)\n", map, file, line)); + uvm_tree_sanity(map, file, line); + uvm_tree_size_chk(map, file, line); +} - (*pr)("PAGE %p:\n", pg); - (*pr)(" flags=%b, vers=%d, wire_count=%d, pa=0x%llx\n", - pg->pg_flags, page_flagbits, pg->pg_version, pg->wire_count, - (long long)pg->phys_addr); - (*pr)(" uobject=%p, uanon=%p, offset=0x%llx loan_count=%d\n", - pg->uobject, pg->uanon, (long long)pg->offset, pg->loan_count); -#if defined(UVM_PAGE_TRKOWN) - if (pg->pg_flags & PG_BUSY) - (*pr)(" owning process = %d, tag=%s", - pg->owner, pg->owner_tag); - else - (*pr)(" page not busy, no owner"); -#else - (*pr)(" [page ownership tracking disabled]"); -#endif -#ifdef __HAVE_VM_PAGE_MD - (*pr)("\tvm_page_md %p\n", &pg->mdpage); -#else - (*pr)("\n"); -#endif +void +vm_map_unlock_ln(struct vm_map *map, char *file, int line) +{ + uvm_tree_sanity(map, file, line); + uvm_tree_size_chk(map, file, line); + LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line)); + if ((map->flags & VM_MAP_INTRSAFE) == 0) + rw_exit(&map->lock); +} - if (!full) - return; +void +vm_map_unlock_read_ln(struct vm_map *map, char *file, int line) +{ + /* XXX: RO */ uvm_tree_sanity(map, file, line); + /* XXX: RO */ uvm_tree_size_chk(map, file, line); + LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line)); + if ((map->flags & VM_MAP_INTRSAFE) == 0) + rw_exit_read(&map->lock); +} - /* cross-verify object/anon */ - if ((pg->pg_flags & PQ_FREE) == 0) { - if (pg->pg_flags & PQ_ANON) { - 
if (pg->uanon == NULL || pg->uanon->an_page != pg) - (*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n", - (pg->uanon) ? pg->uanon->an_page : NULL); - else - (*pr)(" anon backpointer is OK\n"); - } else { - uobj = pg->uobject; - if (uobj) { - (*pr)(" checking object list\n"); - RB_FOREACH(tpg, uvm_objtree, &uobj->memt) { - if (tpg == pg) { - break; - } - } - if (tpg) - (*pr)(" page found on object list\n"); - else - (*pr)(" >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n"); - } - } - } +void +vm_map_downgrade_ln(struct vm_map *map, char *file, int line) +{ + uvm_tree_sanity(map, file, line); + uvm_tree_size_chk(map, file, line); + LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line)); + LPRINTF(("map lock: %p (at %s %d)\n", map, file, line)); + if ((map->flags & VM_MAP_INTRSAFE) == 0) + rw_enter(&map->lock, RW_DOWNGRADE); +} - /* cross-verify page queue */ - if (pg->pg_flags & PQ_FREE) { - if (uvm_pmr_isfree(pg)) - printf(" page found in uvm_pmemrange\n"); - else - printf(" >>> page not found in uvm_pmemrange <<<\n"); - pgl = NULL; - } else if (pg->pg_flags & PQ_INACTIVE) { - pgl = (pg->pg_flags & PQ_SWAPBACKED) ? - &uvm.page_inactive_swp : &uvm.page_inactive_obj; - } else if (pg->pg_flags & PQ_ACTIVE) { - pgl = &uvm.page_active; - } else { - pgl = NULL; +void +vm_map_upgrade_ln(struct vm_map *map, char *file, int line) +{ + /* XXX: RO */ uvm_tree_sanity(map, file, line); + /* XXX: RO */ uvm_tree_size_chk(map, file, line); + LPRINTF(("map unlock: %p (at %s %d)\n", map, file, line)); + if ((map->flags & VM_MAP_INTRSAFE) == 0) { + rw_exit_read(&map->lock); + rw_enter_write(&map->lock); } + LPRINTF(("map lock: %p (at %s %d)\n", map, file, line)); + uvm_tree_sanity(map, file, line); +} - if (pgl) { - (*pr)(" checking pageq list\n"); - TAILQ_FOREACH(tpg, pgl, pageq) { - if (tpg == pg) { - break; - } - } - if (tpg) - (*pr)(" page found on pageq list\n"); - else - (*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n"); - } +void +vm_map_busy_ln(struct vm_map *map, char *file, int line) +{ + map->flags |= VM_MAP_BUSY; } -#endif + +void +vm_map_unbusy_ln(struct vm_map *map, char *file, int line) +{ + int oflags; + + oflags = map->flags; + map->flags &= ~(VM_MAP_BUSY|VM_MAP_WANTLOCK); + if (oflags & VM_MAP_WANTLOCK) + wakeup(&map->flags); +} + + +RB_GENERATE(uvm_map_addr, vm_map_entry, daddrs.addr_entry, + uvm_mapentry_addrcmp); diff --git a/sys/uvm/uvm_map.h b/sys/uvm/uvm_map.h index c416cc51d23..e0e21267e31 100644 --- a/sys/uvm/uvm_map.h +++ b/sys/uvm/uvm_map.h @@ -1,7 +1,22 @@ -/* $OpenBSD: uvm_map.h,v 1.46 2011/06/06 17:10:23 ariane Exp $ */ +/* $OpenBSD: uvm_map.h,v 1.47 2012/03/09 13:01:29 ariane Exp $ */ /* $NetBSD: uvm_map.h,v 1.24 2001/02/18 21:19:08 chs Exp $ */ -/* +/* + * Copyright (c) 2011 Ariane van der Steldt + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * * Copyright (c) 1997 Charles D. 
Cranor and Washington University. * Copyright (c) 1991, 1993, The Regents of the University of California. * @@ -74,6 +89,16 @@ #ifdef _KERNEL +/* + * Internal functions. + * + * Required by clipping macros. + */ +void uvm_map_clip_end(struct vm_map*, struct vm_map_entry*, + vaddr_t); +void uvm_map_clip_start(struct vm_map*, + struct vm_map_entry*, vaddr_t); + /* * UVM_MAP_CLIP_START: ensure that the entry begins at or after * the starting address, if it doesn't we split the entry. @@ -81,8 +106,12 @@ * => map must be locked by caller */ -#define UVM_MAP_CLIP_START(MAP,ENTRY,VA) { \ - if ((VA) > (ENTRY)->start) uvm_map_clip_start(MAP,ENTRY,VA); } +#define UVM_MAP_CLIP_START(_map, _entry, _addr) \ + do { \ + KASSERT((_entry)->end + (_entry)->fspace > (_addr)); \ + if ((_entry)->start < (_addr)) \ + uvm_map_clip_start((_map), (_entry), (_addr)); \ + } while (0) /* * UVM_MAP_CLIP_END: ensure that the entry ends at or before @@ -91,15 +120,16 @@ * => map must be locked by caller */ -#define UVM_MAP_CLIP_END(MAP,ENTRY,VA) { \ - if ((VA) < (ENTRY)->end) uvm_map_clip_end(MAP,ENTRY,VA); } +#define UVM_MAP_CLIP_END(_map, _entry, _addr) \ + do { \ + KASSERT((_entry)->start < (_addr)); \ + if ((_entry)->end > (_addr)) \ + uvm_map_clip_end((_map), (_entry), (_addr)); \ + } while (0) /* * extract flags */ -#define UVM_EXTRACT_REMOVE 0x1 /* remove mapping from old map */ -#define UVM_EXTRACT_CONTIG 0x2 /* try to keep it contig */ -#define UVM_EXTRACT_QREF 0x4 /* use quick refs */ #define UVM_EXTRACT_FIXPROT 0x8 /* set prot to maxprot as we go */ #endif /* _KERNEL */ @@ -133,21 +163,34 @@ union vm_map_object { * Also included is control information for virtual copy operations. */ struct vm_map_entry { - RB_ENTRY(vm_map_entry) rb_entry; /* tree information */ - vaddr_t ownspace; /* free space after */ - vaddr_t space; /* space in subtree */ - struct vm_map_entry *prev; /* previous entry */ - struct vm_map_entry *next; /* next entry */ + union { + RB_ENTRY(vm_map_entry) addr_entry; /* address tree */ + } daddrs; + + union { + RB_ENTRY(vm_map_entry) rbtree; /* Link freespace tree. */ + TAILQ_ENTRY(vm_map_entry) tailq;/* Link freespace queue. */ + TAILQ_ENTRY(vm_map_entry) deadq;/* dead entry queue */ + } dfree; + +#define uvm_map_entry_start_copy start vaddr_t start; /* start address */ vaddr_t end; /* end address */ + + vsize_t guard; /* bytes in guard */ + vsize_t fspace; /* free space */ + union vm_map_object object; /* object I point to */ voff_t offset; /* offset into object */ + struct vm_aref aref; /* anonymous overlay */ + int etype; /* entry type */ + vm_prot_t protection; /* protection code */ vm_prot_t max_protection; /* maximum protection */ vm_inherit_t inheritance; /* inheritance */ + int wired_count; /* can be paged if == 0 */ - struct vm_aref aref; /* anonymous overlay */ int advice; /* madvise advice */ #define uvm_map_entry_stop_copy flags u_int8_t flags; /* flags */ @@ -156,18 +199,19 @@ struct vm_map_entry { #define UVM_MAP_KMEM 0x02 /* from kmem entry pool */ }; -/* - * Marks the map entry as a guard page, using vm_map_entry.etype. - */ -#define MAP_ET_KVAGUARD 0x10 /* guard entry */ - #define VM_MAPENT_ISWIRED(entry) ((entry)->wired_count != 0) +TAILQ_HEAD(uvm_map_deadq, vm_map_entry); /* dead entry queue */ +RB_HEAD(uvm_map_addr, vm_map_entry); +RB_PROTOTYPE(uvm_map_addr, vm_map_entry, daddrs.addr_entry, + uvm_mapentry_addrcmp); + /* - * Maps are doubly-linked lists of map entries, kept sorted - * by address. 
A single hint is provided to start - * searches again from the last successful search, - * insertion, or removal. + * A Map is a rbtree of map entries, kept sorted by address. + * In addition, free space entries are also kept in a rbtree, + * indexed by free size. + * + * * * LOCKING PROTOCOL NOTES: * ----------------------- @@ -214,23 +258,80 @@ struct vm_map_entry { * is busy, and thread is attempting * to write-lock. must be tested * while `flags_lock' is asserted. + * + * VM_MAP_GUARDPAGES r/o; must be specified at map + * initialization time. + * If set, guards will appear between + * automatic allocations. + * No locking required. + * + * VM_MAP_ISVMSPACE r/o; set by uvmspace_alloc. + * Signifies that this map is a vmspace. + * (The implementation treats all maps + * without this bit as kernel maps.) + * No locking required. + * + * + * All automatic allocations (uvm_map without MAP_FIXED) will allocate + * from vm_map.free. + * If that allocation fails: + * - vmspace maps will spill over into vm_map.bfree, + * - all other maps will call uvm_map_kmem_grow() to increase the arena. + * + * vmspace maps have their data, brk() and stack arenas automatically + * updated when uvm_map() is invoked without MAP_FIXED. + * The spill over arena (vm_map.bfree) will contain the space in the brk() + * and stack ranges. + * Kernel maps never have a bfree arena and this tree will always be empty. + * + * + * read_locks and write_locks are used in lock debugging code. */ struct vm_map { struct pmap * pmap; /* Physical map */ struct rwlock lock; /* Lock for map data */ - RB_HEAD(uvm_tree, vm_map_entry) rbhead; /* Tree for entries */ - struct vm_map_entry header; /* List of entries */ - int nentries; /* Number of entries */ + + struct uvm_map_addr addr; /* Entry tree, by addr */ + vsize_t size; /* virtual size */ int ref_count; /* Reference count */ simple_lock_data_t ref_lock; /* Lock for ref_count field */ - vm_map_entry_t hint; /* hint for quick lookups */ - simple_lock_data_t hint_lock; /* lock for hint storage */ - vm_map_entry_t first_free; /* First free space hint */ int flags; /* flags */ unsigned int timestamp; /* Version number */ -#define min_offset header.start -#define max_offset header.end + + vaddr_t min_offset; /* First address in map. */ + vaddr_t max_offset; /* Last address in map. */ + + /* + * Allocation overflow regions. + */ + vaddr_t b_start; /* Start for brk() alloc. */ + vaddr_t b_end; /* End for brk() alloc. */ + vaddr_t s_start; /* Start for stack alloc. */ + vaddr_t s_end; /* End for stack alloc. */ + + /* + * Special address selectors. + * + * The uaddr_exe mapping is used if: + * - protX is selected + * - the pointer is not NULL + * + * If uaddr_exe is not used, the other mappings are checked in + * order of appearance. + * If a hint is given, the selection will only be used if the hint + * falls in the range described by the mapping. + * + * The states are pointers because: + * - they may not all be in use + * - the struct size for different schemes is variable + * + * The uaddr_brk_stack selector will select addresses that are in + * the brk/stack area of the map. + */ + struct uvm_addr_state *uaddr_exe; /* Executable selector. */ + struct uvm_addr_state *uaddr_any[4]; /* More selectors. */ + struct uvm_addr_state *uaddr_brk_stack; /* Brk/stack selector. 
 };
 
 /* vm_map flags */
@@ -239,11 +340,13 @@ struct vm_map {
 #define	VM_MAP_WIREFUTURE	0x04		/* rw: wire future mappings */
 #define	VM_MAP_BUSY		0x08		/* rw: map is busy */
 #define	VM_MAP_WANTLOCK		0x10		/* rw: want to write-lock */
+#define	VM_MAP_GUARDPAGES	0x20		/* rw: add guard pgs to map */
+#define	VM_MAP_ISVMSPACE	0x40		/* ro: map is a vmspace */
 
 /* XXX: number of kernel maps and entries to statically allocate */
 #if !defined(MAX_KMAPENT)
-#define	MAX_KMAPENT	1024	/* XXXCDC: no crash */
+#define	MAX_KMAPENT	1024	/* Sufficient to make it to the scheduler. */
 #endif	/* !defined MAX_KMAPENT */
 
 #ifdef _KERNEL
 
@@ -268,9 +371,7 @@ struct vm_map_intrsafe {
 
 #ifdef _KERNEL
 
-#ifdef PMAP_GROWKERNEL
 extern vaddr_t	uvm_maxkaddr;
-#endif
 
 /*
  * protos: the following prototypes define the interface to vm_map
@@ -279,32 +380,29 @@ extern vaddr_t uvm_maxkaddr;
 void		uvm_map_deallocate(vm_map_t);
 
 int		uvm_map_clean(vm_map_t, vaddr_t, vaddr_t, int);
-void		uvm_map_clip_start(vm_map_t, vm_map_entry_t, vaddr_t);
-void		uvm_map_clip_end(vm_map_t, vm_map_entry_t, vaddr_t);
 vm_map_t	uvm_map_create(pmap_t, vaddr_t, vaddr_t, int);
-int		uvm_map_extract(vm_map_t, vaddr_t, vsize_t,
-		    vm_map_t, vaddr_t *, int);
-vm_map_entry_t	uvm_map_findspace(vm_map_t, vaddr_t, vsize_t, vaddr_t *,
-		    struct uvm_object *, voff_t, vsize_t, int);
+int		uvm_map_extract(struct vm_map*, vaddr_t, vsize_t, vaddr_t*,
+		    int);
 vaddr_t		uvm_map_pie(vaddr_t);
-#define	uvm_map_hint(p, prot)	uvm_map_hint1(p, prot, 1)
-vaddr_t		uvm_map_hint1(struct proc *, vm_prot_t, int);
+vaddr_t		uvm_map_hint(struct vmspace *, vm_prot_t);
 int		uvm_map_inherit(vm_map_t, vaddr_t, vaddr_t, vm_inherit_t);
 int		uvm_map_advice(vm_map_t, vaddr_t, vaddr_t, int);
 void		uvm_map_init(void);
 boolean_t	uvm_map_lookup_entry(vm_map_t, vaddr_t, vm_map_entry_t *);
-void		uvm_map_reference(vm_map_t);
 int		uvm_map_replace(vm_map_t, vaddr_t, vaddr_t, vm_map_entry_t, int);
 int		uvm_map_reserve(vm_map_t, vsize_t, vaddr_t, vsize_t, vaddr_t *);
 void		uvm_map_setup(vm_map_t, vaddr_t, vaddr_t, int);
 int		uvm_map_submap(vm_map_t, vaddr_t, vaddr_t, vm_map_t);
-#define	uvm_unmap(_m, _s, _e)	uvm_unmap_p(_m, _s, _e, 0)
-void		uvm_unmap_p(vm_map_t, vaddr_t, vaddr_t, struct proc *);
-void		uvm_unmap_detach(vm_map_entry_t,int);
-void		uvm_unmap_remove(vm_map_t, vaddr_t, vaddr_t, vm_map_entry_t *,
-		    struct proc *, boolean_t);
+void		uvm_unmap(vm_map_t, vaddr_t, vaddr_t);
+void		uvm_map_set_uaddr(struct vm_map*, struct uvm_addr_state**,
+		    struct uvm_addr_state*);
+int		uvm_map_mquery(struct vm_map*, vaddr_t*, vsize_t, voff_t, int);
+
+void		uvm_unmap_detach(struct uvm_map_deadq*, int);
+void		uvm_unmap_remove(struct vm_map*, vaddr_t, vaddr_t,
+		    struct uvm_map_deadq*, boolean_t, boolean_t);
 
 #endif /* _KERNEL */
 
@@ -337,82 +435,45 @@ void uvm_unmap_remove(vm_map_t, vaddr_t, vaddr_t, vm_map_entry_t *,
 */
 
 #ifdef _KERNEL
-/* XXX: clean up later */
+/*
+ * XXX: clean up later
+ * Half the kernel seems to depend on them being included here.
+ */
 #include <sys/time.h>
-#include <sys/systm.h>	/* for panic() */
-
-static __inline boolean_t	vm_map_lock_try(vm_map_t);
-static __inline void		vm_map_lock(vm_map_t);
-extern const char vmmapbsy[];
-
-static __inline boolean_t
-vm_map_lock_try(struct vm_map *map)
-{
-	boolean_t rv;
-
-	if (map->flags & VM_MAP_INTRSAFE) {
-		rv = TRUE;
-	} else {
-		if (map->flags & VM_MAP_BUSY) {
-			return (FALSE);
-		}
-		rv = (rw_enter(&map->lock, RW_WRITE|RW_NOSLEEP) == 0);
-	}
-
-	if (rv)
-		map->timestamp++;
-
-	return (rv);
-}
-
-static __inline void
-vm_map_lock(struct vm_map *map)
-{
-	if (map->flags & VM_MAP_INTRSAFE)
-		return;
-
-	do {
-		while (map->flags & VM_MAP_BUSY) {
-			map->flags |= VM_MAP_WANTLOCK;
-			tsleep(&map->flags, PVM, (char *)vmmapbsy, 0);
-		}
-	} while (rw_enter(&map->lock, RW_WRITE|RW_SLEEPFAIL) != 0);
-
-	map->timestamp++;
-}
-
-#define	vm_map_lock_read(map)	rw_enter_read(&(map)->lock)
-
-#define	vm_map_unlock(map)					\
-do {								\
-	if (((map)->flags & VM_MAP_INTRSAFE) == 0)		\
-		rw_exit(&(map)->lock);				\
-} while (0)
-
-#define	vm_map_unlock_read(map)	rw_exit_read(&(map)->lock)
-
-#define	vm_map_downgrade(map)	rw_enter(&(map)->lock, RW_DOWNGRADE)
-
-#define	vm_map_upgrade(map)					\
-do {								\
-	rw_exit_read(&(map)->lock);				\
-	rw_enter_write(&(map)->lock);				\
-} while (0)
-
-#define	vm_map_busy(map)					\
-do {								\
-	(map)->flags |= VM_MAP_BUSY;				\
-} while (0)
+#include <sys/systm.h>	/* for panic() */
+
+boolean_t	vm_map_lock_try_ln(struct vm_map*, char*, int);
+void		vm_map_lock_ln(struct vm_map*, char*, int);
+void		vm_map_lock_read_ln(struct vm_map*, char*, int);
+void		vm_map_unlock_ln(struct vm_map*, char*, int);
+void		vm_map_unlock_read_ln(struct vm_map*, char*, int);
+void		vm_map_downgrade_ln(struct vm_map*, char*, int);
+void		vm_map_upgrade_ln(struct vm_map*, char*, int);
+void		vm_map_busy_ln(struct vm_map*, char*, int);
+void		vm_map_unbusy_ln(struct vm_map*, char*, int);
+
+#ifdef DIAGNOSTIC
+#define vm_map_lock_try(map)	vm_map_lock_try_ln(map, __FILE__, __LINE__)
+#define vm_map_lock(map)	vm_map_lock_ln(map, __FILE__, __LINE__)
+#define vm_map_lock_read(map)	vm_map_lock_read_ln(map, __FILE__, __LINE__)
+#define vm_map_unlock(map)	vm_map_unlock_ln(map, __FILE__, __LINE__)
+#define vm_map_unlock_read(map)	vm_map_unlock_read_ln(map, __FILE__, __LINE__)
+#define vm_map_downgrade(map)	vm_map_downgrade_ln(map, __FILE__, __LINE__)
+#define vm_map_upgrade(map)	vm_map_upgrade_ln(map, __FILE__, __LINE__)
+#define vm_map_busy(map)	vm_map_busy_ln(map, __FILE__, __LINE__)
+#define vm_map_unbusy(map)	vm_map_unbusy_ln(map, __FILE__, __LINE__)
+#else
+#define vm_map_lock_try(map)	vm_map_lock_try_ln(map, NULL, 0)
+#define vm_map_lock(map)	vm_map_lock_ln(map, NULL, 0)
+#define vm_map_lock_read(map)	vm_map_lock_read_ln(map, NULL, 0)
+#define vm_map_unlock(map)	vm_map_unlock_ln(map, NULL, 0)
+#define vm_map_unlock_read(map)	vm_map_unlock_read_ln(map, NULL, 0)
+#define vm_map_downgrade(map)	vm_map_downgrade_ln(map, NULL, 0)
+#define vm_map_upgrade(map)	vm_map_upgrade_ln(map, NULL, 0)
+#define vm_map_busy(map)	vm_map_busy_ln(map, NULL, 0)
+#define vm_map_unbusy(map)	vm_map_unbusy_ln(map, NULL, 0)
+#endif
 
-#define	vm_map_unbusy(map)					\
-do {								\
-	int oflags;						\
-								\
-	oflags = (map)->flags;					\
-	(map)->flags &= ~(VM_MAP_BUSY|VM_MAP_WANTLOCK);		\
-	if (oflags & VM_MAP_WANTLOCK)				\
-		wakeup(&(map)->flags);				\
-} while (0)
 
 #endif /* _KERNEL */
 
 /*
diff --git a/sys/uvm/uvm_mmap.c b/sys/uvm/uvm_mmap.c
index a96deed4052..6817224e3ce 100644
--- a/sys/uvm/uvm_mmap.c
+++ b/sys/uvm/uvm_mmap.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: uvm_mmap.c,v 1.87 2011/07/09 05:31:26 matthew Exp $ */
+/* $OpenBSD: uvm_mmap.c,v 1.88 2012/03/09 13:01:29 ariane Exp $ */
 /* $NetBSD: uvm_mmap.c,v 1.49 2001/02/18 21:19:08 chs Exp $ */
 
 /*
@@ -142,43 +142,17 @@ sys_mquery(struct proc *p, void *v, register_t *retval)
 	} else {
 		fp = NULL;
 		uobj = NULL;
-		uoff = 0;
+		uoff = UVM_UNKNOWN_OFFSET;
 	}
 
 	if (vaddr == 0)
-		vaddr = uvm_map_hint(p, prot);
+		vaddr = uvm_map_hint(p->p_vmspace, prot);
 
-	/* prevent a user requested address from falling in heap space */
-	if ((vaddr + size > (vaddr_t)p->p_vmspace->vm_daddr) &&
-	    (vaddr < (vaddr_t)p->p_vmspace->vm_daddr + BRKSIZ)) {
-		if (flags & UVM_FLAG_FIXED) {
-			error = EINVAL;
-			goto done;
-		}
-		vaddr = round_page((vaddr_t)p->p_vmspace->vm_daddr + BRKSIZ);
-	}
-	vm_map_lock(&p->p_vmspace->vm_map);
-
-again:
-	if (uvm_map_findspace(&p->p_vmspace->vm_map, vaddr, size,
-	    &vaddr, uobj, uoff, 0, flags) == NULL) {
-		if (flags & UVM_FLAG_FIXED)
-			error = EINVAL;
-		else
-			error = ENOMEM;
-	} else {
-		/* prevent a returned address from falling in heap space */
-		if ((vaddr + size > (vaddr_t)p->p_vmspace->vm_daddr)
-		    && (vaddr < (vaddr_t)p->p_vmspace->vm_daddr + BRKSIZ)) {
-			vaddr = round_page((vaddr_t)p->p_vmspace->vm_daddr +
-			    BRKSIZ);
-			goto again;
-		}
-		error = 0;
+	error = uvm_map_mquery(&p->p_vmspace->vm_map, &vaddr, size, uoff,
+	    flags);
+	if (error == 0)
 		*retval = (register_t)(vaddr);
-	}
-	vm_map_unlock(&p->p_vmspace->vm_map);
-done:
+
 	if (fp != NULL)
 		FRELE(fp);
 	return (error);
@@ -202,7 +176,7 @@ sys_mincore(struct proc *p, void *v, register_t *retval)
 	struct uvm_object *uobj;
 	struct vm_amap *amap;
 	struct vm_anon *anon;
-	vm_map_entry_t entry;
+	vm_map_entry_t entry, next;
 	vaddr_t start, end, lim;
 	vm_map_t map;
 	vsize_t len, npgs;
@@ -251,15 +225,16 @@ sys_mincore(struct proc *p, void *v, register_t *retval)
 	}
 
 	for (/* nothing */;
-	    entry != &map->header && entry->start < end;
-	    entry = entry->next) {
+	    entry != NULL && entry->start < end;
+	    entry = RB_NEXT(uvm_map_addr, &map->addr, entry)) {
 		KASSERT(!UVM_ET_ISSUBMAP(entry));
 		KASSERT(start >= entry->start);
 
 		/* Make sure there are no holes. */
+		next = RB_NEXT(uvm_map_addr, &map->addr, entry);
 		if (entry->end < end &&
-		    (entry->next == &map->header ||
-		    entry->next->start > entry->end)) {
+		    (next == NULL ||
+		    next->start > entry->end)) {
			error = ENOMEM;
			goto out;
		}
@@ -412,17 +387,6 @@ sys_mmap(struct proc *p, void *v, register_t *retval)
 
 		if (vm_min_address > 0 && addr < vm_min_address)
 			return (EINVAL);
-	} else {
-
-		/*
-		 * not fixed: make sure we skip over the largest possible heap.
-		 * we will refine our guess later (e.g. to account for VAC, etc)
-		 */
-		if (addr == 0)
-			addr = uvm_map_hint(p, prot);
-		else if (!(flags & MAP_TRYFIXED) &&
-		    addr < (vaddr_t)p->p_vmspace->vm_daddr)
-			addr = uvm_map_hint(p, prot);
 	}
 
 	/*
@@ -565,13 +529,6 @@ sys_mmap(struct proc *p, void *v, register_t *retval)
 	error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
 	    flags, handle, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur, p);
 
-	if (error == ENOMEM && !(flags & (MAP_FIXED | MAP_TRYFIXED))) {
-		/* once more, with feeling */
-		addr = uvm_map_hint1(p, prot, 0);
-		error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot,
-		    maxprot, flags, handle, pos,
-		    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur, p);
-	}
 
 	if (error == 0)
 		/* remember to add offset */
@@ -658,7 +615,7 @@ sys_munmap(struct proc *p, void *v, register_t *retval)
 	vsize_t size, pageoff;
 	vm_map_t map;
 	vaddr_t vm_min_address = VM_MIN_ADDRESS;
-	struct vm_map_entry *dead_entries;
+	struct uvm_map_deadq dead_entries;
 
 	/*
 	 * get syscall args...
@@ -700,12 +657,12 @@ sys_munmap(struct proc *p, void *v, register_t *retval)
 	/*
 	 * doit!
 	 */
-	uvm_unmap_remove(map, addr, addr + size, &dead_entries, p, FALSE);
+	TAILQ_INIT(&dead_entries);
+	uvm_unmap_remove(map, addr, addr + size, &dead_entries, FALSE, TRUE);
 
 	vm_map_unlock(map);	/* and unlock */
 
-	if (dead_entries != NULL)
-		uvm_unmap_detach(dead_entries, 0);
+	uvm_unmap_detach(&dead_entries, 0);
 
 	return (0);
 }
@@ -1036,7 +993,7 @@ uvm_mmap(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
 		if (*addr & PAGE_MASK)
 			return(EINVAL);
 		uvmflag |= UVM_FLAG_FIXED;
-		uvm_unmap_p(map, *addr, *addr + size, p);	/* zap! */
+		uvm_unmap(map, *addr, *addr + size);	/* zap! */
 	}
 
 	/*
@@ -1130,7 +1087,7 @@ uvm_mmap(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
 	    (flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY,
 	    advice, uvmflag);
 
-	error = uvm_map_p(map, addr, size, uobj, foff, align, uvmflag, p);
+	error = uvm_map(map, addr, size, uobj, foff, align, uvmflag);
 
 	if (error == 0) {
 		/*
diff --git a/sys/uvm/uvm_unix.c b/sys/uvm/uvm_unix.c
index 06cbf871e41..f4d4490b853 100644
--- a/sys/uvm/uvm_unix.c
+++ b/sys/uvm/uvm_unix.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: uvm_unix.c,v 1.42 2011/06/06 17:10:23 ariane Exp $ */
+/* $OpenBSD: uvm_unix.c,v 1.43 2012/03/09 13:01:29 ariane Exp $ */
 /* $NetBSD: uvm_unix.c,v 1.18 2000/09/13 15:00:25 thorpej Exp $ */
 
 /*
@@ -167,9 +167,7 @@ uvm_coredump(struct proc *p, struct vnode *vp, struct ucred *cred,
 
 	offset = chdr->c_hdrsize + chdr->c_seghdrsize + chdr->c_cpusize;
 
-	for (entry = map->header.next; entry != &map->header;
-	    entry = entry->next) {
-
+	RB_FOREACH(entry, uvm_map_addr, &map->addr) {
 		/* should never happen for a user process */
 		if (UVM_ET_ISSUBMAP(entry)) {
 			panic("uvm_coredump: user process with submap?");
@@ -261,9 +259,7 @@ uvm_coredump_walkmap(struct proc *p, void *iocookie,
 	vaddr_t top;
 	int error;
 
-	for (entry = map->header.next; entry != &map->header;
-	    entry = entry->next) {
-
+	RB_FOREACH(entry, uvm_map_addr, &map->addr) {
 		state.cookie = cookie;
 		state.prot = entry->protection;
 		state.flags = 0;
-- 
cgit v1.2.3
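
The recurring change in the uvm_mmap.c and uvm_unix.c hunks above is the new
calling convention: callers collect dead map entries on a caller-owned
struct uvm_map_deadq and walk entries through the address tree instead of the
removed doubly-linked header list.  The two functions below are an
illustrative sketch only, not part of this commit; they assume the usual
kernel includes (e.g. <sys/param.h>, <uvm/uvm.h>), and the example_* names
are placeholders.

/*
 * Illustrative sketch (not part of the diff): unmapping a range with the
 * uvm_map_deadq interface introduced by this commit.  Dead entries are
 * collected on a caller-owned TAILQ while the map is write-locked and are
 * only detached (freed) after the lock has been dropped, mirroring the
 * sys_munmap hunk above.
 */
void
example_unmap_range(struct vm_map *map, vaddr_t start, vaddr_t end)
{
	struct uvm_map_deadq dead_entries;

	TAILQ_INIT(&dead_entries);
	vm_map_lock(map);
	/* Boolean flags passed as in the sys_munmap hunk above. */
	uvm_unmap_remove(map, start, end, &dead_entries, FALSE, TRUE);
	vm_map_unlock(map);
	uvm_unmap_detach(&dead_entries, 0);	/* free outside the lock */
}

/*
 * Likewise illustrative: iterating over all entries now uses the address
 * tree (RB_FOREACH over map->addr), as in the uvm_unix.c hunks, instead of
 * following entry->next from map->header.
 */
void
example_walk_entries(struct vm_map *map)
{
	struct vm_map_entry *entry;

	RB_FOREACH(entry, uvm_map_addr, &map->addr) {
		/* entry->start .. entry->end describes one mapping */
	}
}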