author		Artur Grabowski <art@cvs.openbsd.org>	1999-02-26 01:30:19 +0000
committer	Artur Grabowski <art@cvs.openbsd.org>	1999-02-26 01:30:19 +0000
commit		4ced8be00ce0c7e0fd9c1cf69ccbfc205eef858b (patch)
tree		5a49b50d778c28ec36cf057ebca6a75a3897676c /sys/uvm/uvm_swap.c
parent		569eeca81ccb896a3bce285b37ac9810765b6c39 (diff)
Import of uvm from NetBSD. Some local changes, some code disabled
Diffstat (limited to 'sys/uvm/uvm_swap.c')
-rw-r--r--	sys/uvm/uvm_swap.c	1977
1 files changed, 1977 insertions, 0 deletions
diff --git a/sys/uvm/uvm_swap.c b/sys/uvm/uvm_swap.c
new file mode 100644
index 00000000000..9fb7611e7a5
--- /dev/null
+++ b/sys/uvm/uvm_swap.c
@@ -0,0 +1,1977 @@
+/*	$NetBSD: uvm_swap.c,v 1.23 1998/12/26 06:25:59 marc Exp $	*/
+
+/*
+ * Copyright (c) 1995, 1996, 1997 Matthew R. Green
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ *    derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
+ * from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/namei.h>
+#include <sys/disklabel.h>
+#include <sys/errno.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/extent.h>
+#include <sys/mount.h>
+#include <sys/pool.h>
+#include <sys/syscallargs.h>
+#include <sys/swap.h>
+
+#include <vm/vm.h>
+#include <vm/vm_conf.h>
+
+#include <uvm/uvm.h>
+
+#include <miscfs/specfs/specdev.h>
+
+/*
+ * uvm_swap.c: manage configuration and i/o to swap space.
+ */
+
+/*
+ * swap space is managed in the following way:
+ *
+ * each swap partition or file is described by a "swapdev" structure.
+ * each "swapdev" structure contains a "swapent" structure which contains
+ * information that is passed up to the user (via system calls).
+ *
+ * each swap partition is assigned a "priority" (int) which controls
+ * swap partition usage.
+ *
+ * the system maintains a global data structure describing all swap
+ * partitions/files.   there is a sorted LIST of "swappri" structures
+ * which describe "swapdev"'s at that priority.   this LIST is headed
+ * by the "swap_priority" global var.    each "swappri" contains a
+ * CIRCLEQ of "swapdev" structures at that priority.
+ *
+ * the system maintains a fixed pool of "swapbuf" structures for use
+ * at swap i/o time.  a swapbuf includes a "buf" structure and an
+ * "aiodone" [we want to avoid malloc()'ing anything at swapout time
+ * since memory may be low].
+ *
+ * locking:
+ *  - swap_syscall_lock (sleep lock): this lock serializes the swapctl
+ *    system call and prevents the swap priority list from changing
+ *    while we are in the middle of a system call (e.g. SWAP_STATS).
+ *  - swap_data_lock (simple_lock): this lock protects all swap data
+ *    structures including the priority list, the swapdev structures,
+ *    and the swapmap extent.
+ *  - swap_buf_lock (simple_lock): this lock protects the free swapbuf
+ *    pool.
+ *
+ * each swap device has the following info:
+ *  - swap device in use (could be disabled, preventing future use)
+ *  - swap enabled (allows new allocations on swap)
+ *  - map info in /dev/drum
+ *  - vnode pointer
+ * for swap files only:
+ *  - block size
+ *  - max byte count in buffer
+ *  - buffer
+ *  - credentials to use when doing i/o to file
+ *
+ * userland controls and configures swap with the swapctl(2) system call.
+ * the sys_swapctl performs the following operations:
+ *  [1] SWAP_NSWAP: returns the number of swap devices currently configured
+ *  [2] SWAP_STATS: given a pointer to an array of swapent structures
+ *	(passed in via "arg") of a size passed in via "misc" ... we load
+ *	the current swap config into the array.
+ *  [3] SWAP_ON: given a pathname in arg (could be device or file) and a
+ *	priority in "misc", start swapping on it.
+ *  [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
+ *  [5] SWAP_CTL: changes the priority of a swap device (new priority in
+ *	"misc")
+ */
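[Editorial sketch, not part of the imported file: the swapctl(2) operations listed above can be exercised from userland once the syscall path is enabled. This minimal program is written against the later, stable <sys/swap.h> interface (SWAP_NSWAP, SWAP_STATS, struct swapent with se_* fields); at the time of this commit sys_swapon still returns EINVAL, so treat the field names and behavior here as assumptions about the eventual API.]

#include <sys/types.h>
#include <sys/swap.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	struct swapent *sep;
	int i, nswap;

	/* SWAP_NSWAP: how many swap devices are configured? */
	nswap = swapctl(SWAP_NSWAP, NULL, 0);
	if (nswap < 1)
		errx(1, "no swap devices configured");

	sep = calloc(nswap, sizeof(*sep));
	if (sep == NULL)
		err(1, "calloc");

	/* SWAP_STATS: fill in up to "misc" swapent structures */
	if (swapctl(SWAP_STATS, sep, nswap) == -1)
		err(1, "SWAP_STATS");

	for (i = 0; i < nswap; i++)
		printf("%s: %d blocks, %d in use, priority %d\n",
		    sep[i].se_path, sep[i].se_nblks, sep[i].se_inuse,
		    sep[i].se_priority);
	free(sep);
	return 0;
}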
+
+/*
+ * SWAP_TO_FILES: allows swapping to plain files.
+ */
+
+#define SWAP_TO_FILES
+
+/*
+ * swapdev: describes a single swap partition/file
+ *
+ * note the following should be true:
+ * swd_inuse <= swd_nblks  [number of blocks in use is <= total blocks]
+ * swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
+ */
+struct swapdev {
+	struct oswapent swd_ose;
+#define	swd_dev		swd_ose.ose_dev		/* device id */
+#define	swd_flags	swd_ose.ose_flags	/* flags:inuse/enable/fake */
+#define	swd_priority	swd_ose.ose_priority	/* our priority */
+	/* also: swd_ose.ose_nblks, swd_ose.ose_inuse */
+	char			*swd_path;	/* saved pathname of device */
+	int			swd_pathlen;	/* length of pathname */
+	int			swd_npages;	/* #pages we can use */
+	int			swd_npginuse;	/* #pages in use */
+	int			swd_drumoffset;	/* page0 offset in drum */
+	int			swd_drumsize;	/* #pages in drum */
+	struct extent		*swd_ex;	/* extent for this swapdev */
+	struct vnode		*swd_vp;	/* backing vnode */
+	CIRCLEQ_ENTRY(swapdev)	swd_next;	/* priority circleq */
+
+#ifdef SWAP_TO_FILES
+	int			swd_bsize;	/* blocksize (bytes) */
+	int			swd_maxactive;	/* max active i/o reqs */
+	struct buf		swd_tab;	/* buffer list */
+	struct ucred		*swd_cred;	/* cred for file access */
+#endif
+};
+
+/*
+ * swap device priority entry; the list is kept sorted on `spi_priority'.
+ */
+struct swappri {
+	int			spi_priority;	/* priority */
+	CIRCLEQ_HEAD(spi_swapdev, swapdev)	spi_swapdev;
+	/* circleq of swapdevs at this priority */
+	LIST_ENTRY(swappri)	spi_swappri;	/* global list of pri's */
+};
+
+/*
+ * swapbuf, swapbuffer plus async i/o info
+ */
+struct swapbuf {
+	struct buf sw_buf;		/* a buffer structure */
+	struct uvm_aiodesc sw_aio;	/* aiodesc structure, used if ASYNC */
+	SIMPLEQ_ENTRY(swapbuf) sw_sq;	/* free list pointer */
+};
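[Editorial sketch, not part of the imported file: the LIST-of-CIRCLEQ shape just defined can be mocked up standalone. This assumes a 4.4BSD-style <sys/queue.h> that still provides the CIRCLEQ macros, and it mirrors the double loop used throughout this file.]

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct swapdev {
	int swd_priority;
	CIRCLEQ_ENTRY(swapdev) swd_next;
};

struct swappri {
	int spi_priority;
	CIRCLEQ_HEAD(spi_swapdev, swapdev) spi_swapdev;
	LIST_ENTRY(swappri) spi_swappri;
};

LIST_HEAD(swap_priority, swappri);

int
main(void)
{
	struct swap_priority swap_priority;
	struct swappri *spp;
	struct swapdev *sdp;

	LIST_INIT(&swap_priority);

	/* one priority bucket holding one device, as swaplist_insert does */
	spp = malloc(sizeof(*spp));
	sdp = malloc(sizeof(*sdp));
	if (spp == NULL || sdp == NULL)
		return 1;
	spp->spi_priority = 0;
	CIRCLEQ_INIT(&spp->spi_swapdev);
	LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
	sdp->swd_priority = 0;
	CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);

	/* the same double loop the allocator and lookup functions use */
	for (spp = swap_priority.lh_first; spp != NULL;
	    spp = spp->spi_swappri.le_next)
		for (sdp = spp->spi_swapdev.cqh_first;
		    sdp != (void *)&spp->spi_swapdev;
		    sdp = sdp->swd_next.cqe_next)
			printf("device at priority %d\n", spp->spi_priority);
	return 0;
}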
+
+/*
+ * The following two structures are used to keep track of data transfers
+ * on swap devices associated with regular files.
+ * NOTE: this code is more or less a copy of vnd.c; we use the same
+ * structure names here to ease porting..
+ */
+struct vndxfer {
+	struct buf	*vx_bp;		/* Pointer to parent buffer */
+	struct swapdev	*vx_sdp;
+	int		vx_error;
+	int		vx_pending;	/* # of pending aux buffers */
+	int		vx_flags;
+#define VX_BUSY		1
+#define VX_DEAD		2
+};
+
+struct vndbuf {
+	struct buf	vb_buf;
+	struct vndxfer	*vb_xfer;
+};
+
+
+/*
+ * We keep a pool of vndbuf's and vndxfer structures.
+ */
+struct pool *vndxfer_pool;
+struct pool *vndbuf_pool;
+
+#define	getvndxfer(vnx)	do {						\
+	int s = splbio();						\
+	vnx = (struct vndxfer *)					\
+		pool_get(vndxfer_pool, PR_MALLOCOK|PR_WAITOK);		\
+	splx(s);							\
+} while (0)
+
+#define putvndxfer(vnx) {						\
+	pool_put(vndxfer_pool, (void *)(vnx));				\
+}
+
+#define	getvndbuf(vbp)	do {						\
+	int s = splbio();						\
+	vbp = (struct vndbuf *)						\
+		pool_get(vndbuf_pool, PR_MALLOCOK|PR_WAITOK);		\
+	splx(s);							\
+} while (0)
+
+#define putvndbuf(vbp) {						\
+	pool_put(vndbuf_pool, (void *)(vbp));				\
+}
+
+
+/*
+ * local variables
+ */
+static struct extent *swapmap;		/* controls the mapping of /dev/drum */
+SIMPLEQ_HEAD(swapbufhead, swapbuf);
+struct pool *swapbuf_pool;
+
+/* list of all active swap devices [by priority] */
+LIST_HEAD(swap_priority, swappri);
+static struct swap_priority swap_priority;
+
+/* locks */
+lock_data_t swap_syscall_lock;
+static simple_lock_data_t swap_data_lock;
+
+/*
+ * prototypes
+ */
+#ifdef notyet
+static void		 swapdrum_add __P((struct swapdev *, int));
+#endif
+static struct swapdev	*swapdrum_getsdp __P((int));
+
+#ifdef notyet /* swapctl */
+static struct swapdev	*swaplist_find __P((struct vnode *, int));
+static void		 swaplist_insert __P((struct swapdev *,
+					      struct swappri *, int));
+static void		 swaplist_trim __P((void));
+
+static int swap_on __P((struct proc *, struct swapdev *));
+#endif
+#ifdef SWAP_OFF_WORKS
+static int swap_off __P((struct proc *, struct swapdev *));
+#endif
+
+#ifdef SWAP_TO_FILES
+static void sw_reg_strategy __P((struct swapdev *, struct buf *, int));
+static void sw_reg_iodone __P((struct buf *));
+static void sw_reg_start __P((struct swapdev *));
+#endif
+
+static void uvm_swap_aiodone __P((struct uvm_aiodesc *));
+static void uvm_swap_bufdone __P((struct buf *));
+static int uvm_swap_io __P((struct vm_page **, int, int, int));
+
+/*
+ * uvm_swap_init: init the swap system data structures and locks
+ *
+ * => called at boot time from init_main.c after the filesystems
+ *	are brought up (which happens after uvm_init())
+ */
+void
+uvm_swap_init()
+{
+	UVMHIST_FUNC("uvm_swap_init");
+
+	UVMHIST_CALLED(pdhist);
+	/*
+	 * first, init the swap list, its counter, and its lock.
+	 * then get a handle on the vnode for /dev/drum by using
+	 * its dev_t number ("swapdev", from MD conf.c).
+	 */
+
+	LIST_INIT(&swap_priority);
+	uvmexp.nswapdev = 0;
+	lockinit(&swap_syscall_lock, PVM, "swapsys", 0, 0);
+	simple_lock_init(&swap_data_lock);
+
+	if (bdevvp(swapdev, &swapdev_vp))
+		panic("uvm_swap_init: can't get vnode for swap device");
+
+	/*
+	 * create swap block resource map to map /dev/drum.   the range
+	 * from 1 to INT_MAX allows 2 gigablocks of swap space.  note
+	 * that block 0 is reserved (used to indicate an allocation
+	 * failure, or no allocation).
+	 */
+	swapmap = extent_create("swapmap", 1, INT_MAX,
+				M_VMSWAP, 0, 0, EX_NOWAIT);
+	if (swapmap == 0)
+		panic("uvm_swap_init: extent_create failed");
+
+	/*
+	 * allocate our private pool of "swapbuf" structures (includes
+	 * a "buf" structure).  ["nswbuf" comes from param.c and can
+	 * be adjusted by MD code before we get here].
+	 */
+
+	swapbuf_pool =
+		pool_create(sizeof(struct swapbuf), 0, 0, 0, "swp buf", 0,
+			    NULL, NULL, 0);
+	if (swapbuf_pool == NULL)
+		panic("swapinit: pool_create failed");
+	/* XXX - set a maximum on swapbuf_pool? */
+
+	vndxfer_pool =
+		pool_create(sizeof(struct vndxfer), 0, 0, 0, "swp vnx", 0,
+			    NULL, NULL, 0);
+	if (vndxfer_pool == NULL)
+		panic("swapinit: pool_create failed");
+
+	vndbuf_pool =
+		pool_create(sizeof(struct vndbuf), 0, 0, 0, "swp vnd", 0,
+			    NULL, NULL, 0);
+	if (vndbuf_pool == NULL)
+		panic("swapinit: pool_create failed");
+	/*
+	 * done!
+	 */
+	UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
+}
+
+/*
+ * swaplist functions: functions that operate on the list of swap
+ * devices on the system.
+ */
+
+/*
+ * swaplist_insert: insert swap device "sdp" into the global list
+ *
+ * => caller must hold both swap_syscall_lock and swap_data_lock
+ * => caller must provide a newly malloc'd swappri structure (we will
+ *	FREE it if we don't need it... this is to prevent malloc blocking
+ *	here while adding swap)
+ */
+#ifdef notyet /* used by swapctl */
+static void
+swaplist_insert(sdp, newspp, priority)
+	struct swapdev *sdp;
+	struct swappri *newspp;
+	int priority;
+{
+	struct swappri *spp, *pspp;
+	UVMHIST_FUNC("swaplist_insert"); UVMHIST_CALLED(pdhist);
+
+	/*
+	 * find entry at or after which to insert the new device.
+	 */
+	for (pspp = NULL, spp = swap_priority.lh_first; spp != NULL;
+	     spp = spp->spi_swappri.le_next) {
+		if (priority <= spp->spi_priority)
+			break;
+		pspp = spp;
+	}
+
+	/*
+	 * new priority?
+	 */
+	if (spp == NULL || spp->spi_priority != priority) {
+		spp = newspp;	/* use newspp! */
+		UVMHIST_LOG(pdhist, "created new swappri = %d", priority, 0, 0, 0);
+
+		spp->spi_priority = priority;
+		CIRCLEQ_INIT(&spp->spi_swapdev);
+
+		if (pspp)
+			LIST_INSERT_AFTER(pspp, spp, spi_swappri);
+		else
+			LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
+	} else {
+		/* we don't need a new priority structure, free it */
+		FREE(newspp, M_VMSWAP);
+	}
+
+	/*
+	 * priority found (or created).   now insert on the priority's
+	 * circleq list and bump the total number of swapdevs.
+	 */
+	sdp->swd_priority = priority;
+	CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
+	uvmexp.nswapdev++;
+
+	/*
+	 * done!
+	 */
+}
+#endif
+
+#ifdef notyet /* used by swapctl */
+/*
+ * swaplist_find: find and optionally remove a swap device from the
+ *	global list.
+ *
+ * => caller must hold both swap_syscall_lock and swap_data_lock
+ * => we return the swapdev we found (and removed)
+ */
+static struct swapdev *
+swaplist_find(vp, remove)
+	struct vnode *vp;
+	boolean_t remove;
+{
+	struct swapdev *sdp;
+	struct swappri *spp;
+
+	/*
+	 * search the lists for the requested vp
+	 */
+	for (spp = swap_priority.lh_first; spp != NULL;
+	     spp = spp->spi_swappri.le_next) {
+		for (sdp = spp->spi_swapdev.cqh_first;
+		     sdp != (void *)&spp->spi_swapdev;
+		     sdp = sdp->swd_next.cqe_next)
+			if (sdp->swd_vp == vp) {
+				if (remove) {
+					CIRCLEQ_REMOVE(&spp->spi_swapdev,
+						       sdp, swd_next);
+					uvmexp.nswapdev--;
+				}
+				return(sdp);
+			}
+	}
+	return (NULL);
+}
+
+
+/*
+ * swaplist_trim: scan priority list for empty priority entries and kill
+ *	them.
+ *
+ * => caller must hold both swap_syscall_lock and swap_data_lock
+ */
+static void
+swaplist_trim()
+{
+	struct swappri *spp, *nextspp;
+
+	for (spp = swap_priority.lh_first; spp != NULL; spp = nextspp) {
+		nextspp = spp->spi_swappri.le_next;
+		if (spp->spi_swapdev.cqh_first != (void *)&spp->spi_swapdev)
+			continue;
+		LIST_REMOVE(spp, spi_swappri);
+		free((caddr_t)spp, M_VMSWAP);
+	}
+}
+
+/*
+ * swapdrum_add: add a "swapdev"'s blocks into /dev/drum's area.
+ *
+ * => caller must hold swap_syscall_lock
+ * => swap_data_lock should be unlocked (we may sleep)
+ */
+static void
+swapdrum_add(sdp, npages)
+	struct swapdev *sdp;
+	int	npages;
+{
+	u_long result;
+
+	if (extent_alloc(swapmap, npages, EX_NOALIGN, EX_NOBOUNDARY,
+	    EX_WAITOK, &result))
+		panic("swapdrum_add");
+
+	sdp->swd_drumoffset = result;
+	sdp->swd_drumsize = npages;
+}
+#endif
+
+/*
+ * swapdrum_getsdp: given a page offset in /dev/drum, convert it back
+ *	to the "swapdev" that maps that section of the drum.
+ *
+ * => each swapdev takes one big contig chunk of the drum
+ * => caller must hold swap_data_lock
+ */
+static struct swapdev *
+swapdrum_getsdp(pgno)
+	int pgno;
+{
+	struct swapdev *sdp;
+	struct swappri *spp;
+
+	for (spp = swap_priority.lh_first; spp != NULL;
+	     spp = spp->spi_swappri.le_next)
+		for (sdp = spp->spi_swapdev.cqh_first;
+		     sdp != (void *)&spp->spi_swapdev;
+		     sdp = sdp->swd_next.cqe_next)
+			if (pgno >= sdp->swd_drumoffset &&
+			    pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
+				return sdp;
+			}
+	return NULL;
+}
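[Editorial sketch, not part of the imported file: the drum lookup above is simple interval arithmetic, since each swapdev owns one contiguous [drumoffset, drumoffset + drumsize) range of drum pages. A standalone mock with invented names behaves the same way.]

#include <stdio.h>

/* hypothetical stand-in for the fields swapdrum_getsdp consults */
struct mockdev {
	const char *name;
	int drumoffset;	/* first drum page this device maps */
	int drumsize;	/* number of contiguous drum pages */
};

static const char *
drum_lookup(const struct mockdev *devs, int ndevs, int pgno)
{
	int i;

	for (i = 0; i < ndevs; i++)
		if (pgno >= devs[i].drumoffset &&
		    pgno < devs[i].drumoffset + devs[i].drumsize)
			return devs[i].name;
	return "unmapped";
}

int
main(void)
{
	/* two devices carved out of the drum, back to back */
	struct mockdev devs[] = {
		{ "sd0b", 1, 1024 },	/* drum pages 1..1024 */
		{ "sd1b", 1025, 2048 },	/* drum pages 1025..3072 */
	};

	printf("page 1 -> %s\n", drum_lookup(devs, 2, 1));	 /* sd0b */
	printf("page 2000 -> %s\n", drum_lookup(devs, 2, 2000)); /* sd1b */
	printf("page 0 -> %s\n", drum_lookup(devs, 2, 0));	 /* unmapped */
	return 0;
}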
+
+
+/*XXX
+ *XXX
+ *XXX*/
+int
+sys_swapon(p, v, retval)
+	struct proc *p;
+	void *v;
+	register_t *retval;
+{
+	return EINVAL;
+}
+
+#ifdef notyet /* XXXXXXXXXXXXXXXX (it has other bugs beside the fact that I don't want to change syscalls.master) */
+/*
+ * sys_swapctl: main entry point for swapctl(2) system call
+ *	[with two helper functions: swap_on and swap_off]
+ */
+int
+sys_swapctl(p, v, retval)
+	struct proc *p;
+	void *v;
+	register_t *retval;
+{
+	struct sys_swapctl_args /* {
+		syscallarg(int) cmd;
+		syscallarg(void *) arg;
+		syscallarg(int) misc;
+	} */ *uap = (struct sys_swapctl_args *)v;
+	struct vnode *vp;
+	struct nameidata nd;
+	struct swappri *spp;
+	struct swapdev *sdp;
+	struct swapent *sep;
+	char	userpath[PATH_MAX + 1];
+	size_t	len;
+	int	count, error, misc;
+	int	priority;
+	UVMHIST_FUNC("sys_swapctl"); UVMHIST_CALLED(pdhist);
+
+	misc = SCARG(uap, misc);
+
+	/*
+	 * ensure serialized syscall access by grabbing the swap_syscall_lock
+	 */
+	lockmgr(&swap_syscall_lock, LK_EXCLUSIVE, (void *)0, p);
+
+	/*
+	 * we handle the non-priv NSWAP and STATS request first.
+	 *
+	 * SWAP_NSWAP: return number of config'd swap devices
+	 * [can also be obtained with uvmexp sysctl]
+	 */
+	if (SCARG(uap, cmd) == SWAP_NSWAP) {
+		UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%d", uvmexp.nswapdev,
+		    0, 0, 0);
+		*retval = uvmexp.nswapdev;
+		error = 0;
+		goto out;
+	}
+
+	/*
+	 * SWAP_STATS: get stats on current # of configured swap devs
+	 *
+	 * note that the swap_priority list can't change as long
+	 * as we are holding the swap_syscall_lock.  we don't want
+	 * to grab the swap_data_lock because we may fault&sleep during
+	 * copyout() and we don't want to be holding that lock then!
+	 */
+	if (SCARG(uap, cmd) == SWAP_STATS
+#if defined(COMPAT_13)
+	    || SCARG(uap, cmd) == SWAP_OSTATS
+#endif
+	    ) {
+		sep = (struct swapent *)SCARG(uap, arg);
+		count = 0;
+
+		for (spp = swap_priority.lh_first; spp != NULL;
+		    spp = spp->spi_swappri.le_next) {
+			for (sdp = spp->spi_swapdev.cqh_first;
+			     sdp != (void *)&spp->spi_swapdev && misc-- > 0;
+			     sdp = sdp->swd_next.cqe_next) {
+				/*
+				 * backwards compatibility for system call.
+				 * note that we use 'struct oswapent' as an
+				 * overlay into both 'struct swapdev' and
+				 * the userland 'struct swapent', as we
+				 * want to retain backwards compatibility
+				 * with NetBSD 1.3.
+				 */
+				sdp->swd_ose.ose_inuse =
+				    btodb(sdp->swd_npginuse << PAGE_SHIFT);
+				error = copyout((caddr_t)&sdp->swd_ose,
+				    (caddr_t)sep, sizeof(struct oswapent));
+
+				/* now copy out the path if necessary */
+#if defined(COMPAT_13)
+				if (error == 0 && SCARG(uap, cmd) == SWAP_STATS)
+#else
+				if (error == 0)
+#endif
+					error = copyout((caddr_t)sdp->swd_path,
+					    (caddr_t)&sep->se_path,
+					    sdp->swd_pathlen);
+
+				if (error)
+					goto out;
+				count++;
+#if defined(COMPAT_13)
+				if (SCARG(uap, cmd) == SWAP_OSTATS)
+					((struct oswapent *)sep)++;
+				else
+#endif
+					sep++;
+			}
+		}
+
+		UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
+
+		*retval = count;
+		error = 0;
+		goto out;
+	}
+
+	/*
+	 * all other requests require superuser privs.   verify.
+	 */
+	if ((error = suser(p->p_ucred, &p->p_acflag)))
+		goto out;
+
+	/*
+	 * at this point we expect a path name in arg.   we will
+	 * use namei() to gain a vnode reference (vref), and lock
+	 * the vnode (VOP_LOCK).
+	 *
+	 * XXX: a NULL arg means use the root vnode pointer (e.g. for
+	 * miniroot)
+	 */
+	if (SCARG(uap, arg) == NULL) {
+		vp = rootvp;		/* miniroot */
+		if (vget(vp, LK_EXCLUSIVE)) {
+			error = EBUSY;
+			goto out;
+		}
+		if (SCARG(uap, cmd) == SWAP_ON &&
+		    copystr("miniroot", userpath, sizeof userpath, &len))
+			panic("swapctl: miniroot copy failed");
+	} else {
+		int	space;
+		char	*where;
+
+		if (SCARG(uap, cmd) == SWAP_ON) {
+			if ((error = copyinstr(SCARG(uap, arg), userpath,
+			    sizeof userpath, &len)))
+				goto out;
+			space = UIO_SYSSPACE;
+			where = userpath;
+		} else {
+			space = UIO_USERSPACE;
+			where = (char *)SCARG(uap, arg);
+		}
+		NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, space, where, p);
+		if ((error = namei(&nd)))
+			goto out;
+		vp = nd.ni_vp;
+	}
+	/* note: "vp" is referenced and locked */
+
+	error = 0;		/* assume no error */
+	switch(SCARG(uap, cmd)) {
+	case SWAP_CTL:
+		/*
+		 * get new priority, remove old entry (if any) and then
+		 * reinsert it in the correct place.  finally, prune out
+		 * any empty priority structures.
+		 */
+		priority = SCARG(uap, misc);
+		spp = (struct swappri *)
+			malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
+		simple_lock(&swap_data_lock);
+		if ((sdp = swaplist_find(vp, 1)) == NULL) {
+			error = ENOENT;
+		} else {
+			swaplist_insert(sdp, spp, priority);
+			swaplist_trim();
+		}
+		simple_unlock(&swap_data_lock);
+		if (error)
+			free(spp, M_VMSWAP);
+		break;
+
+	case SWAP_ON:
+		/*
+		 * check for duplicates.   if none found, then insert a
+		 * dummy entry on the list to prevent someone else from
+		 * trying to enable this device while we are working on
+		 * it.
+		 */
+		priority = SCARG(uap, misc);
+		simple_lock(&swap_data_lock);
+		if ((sdp = swaplist_find(vp, 0)) != NULL) {
+			error = EBUSY;
+			simple_unlock(&swap_data_lock);
+			break;
+		}
+		sdp = (struct swapdev *)
+			malloc(sizeof *sdp, M_VMSWAP, M_WAITOK);
+		spp = (struct swappri *)
+			malloc(sizeof *spp, M_VMSWAP, M_WAITOK);
+		bzero(sdp, sizeof(*sdp));
+		sdp->swd_flags = SWF_FAKE;	/* placeholder only */
+		sdp->swd_vp = vp;
+		sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
+#ifdef SWAP_TO_FILES
+		/*
+		 * XXX Is NFS elaboration necessary?
+		 */
+		if (vp->v_type == VREG)
+			sdp->swd_cred = crdup(p->p_ucred);
+#endif
+		swaplist_insert(sdp, spp, priority);
+		simple_unlock(&swap_data_lock);
+
+		sdp->swd_pathlen = len;
+		sdp->swd_path = malloc(sdp->swd_pathlen, M_VMSWAP, M_WAITOK);
+		if (copystr(userpath, sdp->swd_path, sdp->swd_pathlen, 0) != 0)
+			panic("swapctl: copystr");
+		/*
+		 * we've now got a FAKE placeholder in the swap list.
+		 * now attempt to enable swap on it.  if we fail, undo
+		 * what we've done and kill the fake entry we just inserted.
+		 * if swap_on is a success, it will clear the SWF_FAKE flag
+		 */
+		if ((error = swap_on(p, sdp)) != 0) {
+			simple_lock(&swap_data_lock);
+			(void) swaplist_find(vp, 1);	/* kill fake entry */
+			swaplist_trim();
+			simple_unlock(&swap_data_lock);
+#ifdef SWAP_TO_FILES
+			if (vp->v_type == VREG)
+				crfree(sdp->swd_cred);
+#endif
+			free(sdp->swd_path, M_VMSWAP);
+			free((caddr_t)sdp, M_VMSWAP);
+			break;
+		}
+
+		/*
+		 * got it!   now add a second reference to vp so that
+		 * we keep a reference to the vnode after we return.
+		 */
+		vref(vp);
+		break;
+
+	case SWAP_OFF:
+		UVMHIST_LOG(pdhist, "someone is using SWAP_OFF...??", 0,0,0,0);
+#ifdef SWAP_OFF_WORKS
+		/*
+		 * find the entry of interest and ensure it is enabled.
+		 */
+		simple_lock(&swap_data_lock);
+		if ((sdp = swaplist_find(vp, 0)) == NULL) {
+			simple_unlock(&swap_data_lock);
+			error = ENXIO;
+			break;
+		}
+		/*
+		 * If a device isn't in use or enabled, we
+		 * can't stop swapping from it (again).
+		 */
+		if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
+			simple_unlock(&swap_data_lock);
+			error = EBUSY;
+			break;
+		}
+		/* XXXCDC: should we call with list locked or unlocked? */
+		if ((error = swap_off(p, sdp)) != 0)
+			break;
+		/* XXXCDC: might need relock here */
+
+		/*
+		 * now we can kill the entry.
+		 */
+		if ((sdp = swaplist_find(vp, 1)) == NULL) {
+			error = ENXIO;
+			break;
+		}
+		simple_unlock(&swap_data_lock);
+		free((caddr_t)sdp, M_VMSWAP);
+#else
+		error = EINVAL;
+#endif
+		break;
+
+	default:
+		UVMHIST_LOG(pdhist, "unhandled command: %#x",
+		    SCARG(uap, cmd), 0, 0, 0);
+		error = EINVAL;
+	}
+
+	/*
+	 * done!  use vput to drop our reference and unlock
+	 */
+	vput(vp);
+out:
+	lockmgr(&swap_syscall_lock, LK_RELEASE, (void *)0, p);
+
+	UVMHIST_LOG(pdhist, "<- done!  error=%d", error, 0, 0, 0);
+	return (error);
+}
+#endif
+
+
+/*
+ * swap_on: attempt to enable a swapdev for swapping.   note that the
+ *	swapdev is already on the global list, but disabled (marked
+ *	SWF_FAKE).
+ *
+ * => we avoid the start of the disk (to protect disk labels)
+ * => we also avoid the miniroot, if we are swapping to root.
+ * => caller should leave swap_data_lock unlocked, we may lock it
+ *	if needed.
+ */
+#ifdef notyet /* used by swapctl */
+static int
+swap_on(p, sdp)
+	struct proc *p;
+	struct swapdev *sdp;
+{
+	static int count = 0;	/* static */
+	struct vnode *vp;
+	int error, npages, nblocks, size;
+	long addr;
+#ifdef SWAP_TO_FILES
+	struct vattr va;
+#endif
+#ifdef NFS
+	extern int (**nfsv2_vnodeop_p) __P((void *));
+#endif /* NFS */
+	dev_t dev;
+	char *name;
+	UVMHIST_FUNC("swap_on"); UVMHIST_CALLED(pdhist);
+
+	/*
+	 * we want to enable swapping on sdp.   the swd_vp contains
+	 * the vnode we want (locked and ref'd), and the swd_dev
+	 * contains the dev_t of the file, if it is a block device.
+	 */
+
+	vp = sdp->swd_vp;
+	dev = sdp->swd_dev;
+
+	/*
+	 * open the swap file (mostly useful for block device files to
+	 * let device driver know what is up).
+	 *
+	 * we skip the open/close for root on swap because the root
+	 * has already been opened when root was mounted (mountroot).
+	 */
+	if (vp != rootvp) {
+		if ((error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p)))
+			return (error);
+	}
+
+	/* XXX this only works for block devices */
+	UVMHIST_LOG(pdhist, "  dev=%d, major(dev)=%d", dev, major(dev), 0,0);
+
+	/*
+	 * we now need to determine the size of the swap area.   for
+	 * block specials we can call the d_psize function.
+	 * for normal files, we must stat [get attrs].
+	 *
+	 * we put the result in nblks.
+	 * for normal files, we also want the filesystem block size
+	 * (which we get with statfs).
+	 */
+	switch (vp->v_type) {
+	case VBLK:
+		if (bdevsw[major(dev)].d_psize == 0 ||
+		    (nblocks = (*bdevsw[major(dev)].d_psize)(dev)) == -1) {
+			error = ENXIO;
+			goto bad;
+		}
+		break;
+
+#ifdef SWAP_TO_FILES
+	case VREG:
+		if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)))
+			goto bad;
+		nblocks = (int)btodb(va.va_size);
+		if ((error =
+		     VFS_STATFS(vp->v_mount, &vp->v_mount->mnt_stat, p)) != 0)
+			goto bad;
+
+		sdp->swd_bsize = vp->v_mount->mnt_stat.f_iosize;
+		/*
+		 * limit the max # of outstanding I/O requests we issue
+		 * at any one time.   take it easy on NFS servers.
+		 */
+#ifdef NFS
+		if (vp->v_op == nfsv2_vnodeop_p)
+			sdp->swd_maxactive = 2; /* XXX */
+		else
+#endif /* NFS */
+			sdp->swd_maxactive = 8; /* XXX */
+		break;
+#endif
+
+	default:
+		error = ENXIO;
+		goto bad;
+	}
+
+	/*
+	 * save nblocks in a safe place and convert to pages.
+	 */
+
+	sdp->swd_ose.ose_nblks = nblocks;
+	npages = dbtob((u_int64_t)nblocks) >> PAGE_SHIFT;
+
+	/*
+	 * for block special files, we want to make sure that we leave
+	 * the disklabel and bootblocks alone, so we arrange to skip
+	 * over them (randomly choosing to skip PAGE_SIZE bytes).
+	 * note that because of this the "size" can be less than the
+	 * actual number of blocks on the device.
+	 */
+	if (vp->v_type == VBLK) {
+		/* we use pages 1 to (size - 1) [inclusive] */
+		size = npages - 1;
+		addr = 1;
+	} else {
+		/* we use pages 0 to (size - 1) [inclusive] */
+		size = npages;
+		addr = 0;
+	}
+
+	/*
+	 * make sure we have enough blocks for a reasonable sized swap
+	 * area.   we want at least one page.
+	 */
+
+	if (size < 1) {
+		UVMHIST_LOG(pdhist, "  size <= 1!!", 0, 0, 0, 0);
+		error = EINVAL;
+		goto bad;
+	}
+
+	UVMHIST_LOG(pdhist, "  dev=%x: size=%d addr=%ld\n", dev, size, addr, 0);
+
+	/*
+	 * now we need to allocate an extent to manage this swap device
+	 */
+	name = malloc(12, M_VMSWAP, M_WAITOK);
+	sprintf(name, "swap0x%04x", count++);
+
+	/* note that extent_create's 3rd arg is inclusive, thus "- 1" */
+	sdp->swd_ex = extent_create(name, 0, npages - 1, M_VMSWAP,
+				    0, 0, EX_WAITOK);
+	/* allocate the `saved' region from the extent so it won't be used */
+	if (addr) {
+		if (extent_alloc_region(sdp->swd_ex, 0, addr, EX_WAITOK))
+			panic("disklabel region");
+		sdp->swd_npginuse += addr;
+		uvmexp.swpginuse += addr;
+	}
+
+
+	/*
+	 * if the vnode we are swapping to is the root vnode
+	 * (i.e. we are swapping to the miniroot) then we want
+	 * to make sure we don't overwrite it.   do a statfs to
+	 * find its size and skip over it.
+	 */
+	if (vp == rootvp) {
+		struct mount *mp;
+		struct statfs *sp;
+		int rootblocks, rootpages;
+
+		mp = rootvnode->v_mount;
+		sp = &mp->mnt_stat;
+		rootblocks = sp->f_blocks * btodb(sp->f_bsize);
+		rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
+		if (rootpages > npages)
+			panic("swap_on: miniroot larger than swap?");
+
+		if (extent_alloc_region(sdp->swd_ex, addr,
+					rootpages, EX_WAITOK))
+			panic("swap_on: unable to preserve miniroot");
+
+		sdp->swd_npginuse += (rootpages - addr);
+		uvmexp.swpginuse += (rootpages - addr);
+
+		printf("Preserved %d pages of miniroot ", rootpages);
+		printf("leaving %d pages of swap\n", size - rootpages);
+	}
+
+	/*
+	 * now add the new swapdev to the drum and enable.
+	 */
+	simple_lock(&swap_data_lock);
+	swapdrum_add(sdp, npages);
+	sdp->swd_npages = npages;
+	sdp->swd_flags &= ~SWF_FAKE;	/* going live */
+	sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
+	simple_unlock(&swap_data_lock);
+	uvmexp.swpages += npages;
+
+	/*
+	 * add anon's to reflect the swap space we added
+	 */
+	uvm_anon_add(size);
+
+#if 0
+	/*
+	 * At this point we could arrange to reserve memory for the
+	 * swap buffer pools.
+	 *
+	 * I don't think this is necessary, since swapping starts well
+	 * ahead of serious memory deprivation and the memory resource
+	 * pools hold on to actively used memory. This should ensure
+	 * we always have some resources to continue operation.
+	 */
+
+	int s = splbio();
+	int n = 8 * sdp->swd_maxactive;
+
+	(void)pool_prime(swapbuf_pool, n, 0);
+
+	if (vp->v_type == VREG) {
+		/* Allocate additional vnx and vnd buffers */
+		/*
+		 * Allocation Policy:
+		 *	(8  * swd_maxactive) vnx headers per swap dev
+		 *	(16 * swd_maxactive) vnd buffers per swap dev
+		 */
+
+		n = 8 * sdp->swd_maxactive;
+		(void)pool_prime(vndxfer_pool, n, 0);
+
+		n = 16 * sdp->swd_maxactive;
+		(void)pool_prime(vndbuf_pool, n, 0);
+	}
+	splx(s);
+#endif
+
+	return (0);
+
+bad:
+	/*
+	 * failure: close device if necessary and return error.
+	 */
+	if (vp != rootvp)
+		(void)VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
+	return (error);
+}
+#endif
+
+#ifdef SWAP_OFF_WORKS
+/*
+ * swap_off: stop swapping on swapdev
+ *
+ * XXXCDC: what conditions go here?
+ */
+static int
+swap_off(p, sdp)
+	struct proc *p;
+	struct swapdev *sdp;
+{
+	char	*name;
+	UVMHIST_FUNC("swap_off"); UVMHIST_CALLED(pdhist);
+
+	/* turn off the enable flag */
+	sdp->swd_flags &= ~SWF_ENABLE;
+
+	UVMHIST_LOG(pdhist, "  dev=%x", sdp->swd_dev);
+
+	/*
+	 * XXX write me
+	 *
+	 * the idea is to find out which processes are using this swap
+	 * device, and page them all in.
+	 *
+	 * eventually, we should try to move them out to other swap areas
+	 * if available.
+	 *
+	 * The alternative is to create a redirection map for this swap
+	 * device.  This should work by moving all the pages of data from
+	 * the ex-swap device to another one, and making an entry in the
+	 * redirection map for it.  locking is going to be important for
+	 * this!
+	 *
+	 * XXXCDC: also need to shrink anon pool
+	 */
+
+	/* until the above code is written, we must ENODEV */
+	return ENODEV;
+
+	extent_free(swapmap, sdp->swd_mapoffset, sdp->swd_mapsize, EX_WAITOK);
+	name = sdp->swd_ex->ex_name;
+	extent_destroy(sdp->swd_ex);
+	free(name, M_VMSWAP);
+	free((caddr_t)sdp->swd_ex, M_VMSWAP);
+	if (sdp->swp_vp != rootvp)
+		(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, p->p_ucred, p);
+	if (sdp->swd_vp)
+		vrele(sdp->swd_vp);
+	free((caddr_t)sdp, M_VMSWAP);
+	return (0);
+}
+#endif
+
+/*
+ * /dev/drum interface and i/o functions
+ */
+
+/*
+ * swread: the read function for the drum (just a call to physio)
+ */
+/*ARGSUSED*/
+int
+swread(dev, uio, ioflag)
+	dev_t dev;
+	struct uio *uio;
+	int ioflag;
+{
+	UVMHIST_FUNC("swread"); UVMHIST_CALLED(pdhist);
+
+	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
+	return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
+}
+
+/*
+ * swwrite: the write function for the drum (just a call to physio)
+ */
+/*ARGSUSED*/
+int
+swwrite(dev, uio, ioflag)
+	dev_t dev;
+	struct uio *uio;
+	int ioflag;
+{
+	UVMHIST_FUNC("swwrite"); UVMHIST_CALLED(pdhist);
+
+	UVMHIST_LOG(pdhist, "  dev=%x offset=%qx", dev, uio->uio_offset, 0, 0);
+	return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
+}
+
+/*
+ * swstrategy: perform I/O on the drum
+ *
+ * => we must map the i/o request from the drum to the correct swapdev.
+ */
+void
+swstrategy(bp)
+	struct buf *bp;
+{
+	struct swapdev *sdp;
+	struct vnode *vp;
+	int	pageno;
+	int	bn;
+	UVMHIST_FUNC("swstrategy"); UVMHIST_CALLED(pdhist);
+
+	/*
+	 * convert block number to swapdev.   note that swapdev can't
+	 * be yanked out from under us because we are holding resources
+	 * in it (i.e. the blocks we are doing I/O on).
+	 */
+	pageno = dbtob(bp->b_blkno) >> PAGE_SHIFT;
+	simple_lock(&swap_data_lock);
+	sdp = swapdrum_getsdp(pageno);
+	simple_unlock(&swap_data_lock);
+	if (sdp == NULL) {
+		bp->b_error = EINVAL;
+		bp->b_flags |= B_ERROR;
+		biodone(bp);
+		UVMHIST_LOG(pdhist, "  failed to get swap device", 0, 0, 0, 0);
+		return;
+	}
+
+	/*
+	 * convert drum page number to block number on this swapdev.
+	 */
+
+	pageno = pageno - sdp->swd_drumoffset;	/* page # on swapdev */
+	bn = btodb(pageno << PAGE_SHIFT);	/* convert to diskblock */
+
+	UVMHIST_LOG(pdhist, "  %s: mapoff=%x bn=%x bcount=%ld\n",
+	    ((bp->b_flags & B_READ) == 0) ? "write" : "read",
+	    sdp->swd_drumoffset, bn, bp->b_bcount);
+
+
+	/*
+	 * for block devices we finish up here.
+	 * for regular files we have to do more work which we delegate
+	 * to sw_reg_strategy().
+	 */
+
+	switch (sdp->swd_vp->v_type) {
+	default:
+		panic("swstrategy: vnode type 0x%x", sdp->swd_vp->v_type);
+	case VBLK:
+
+		/*
+		 * must convert "bp" from an I/O on /dev/drum to an I/O
+		 * on the swapdev (sdp).
+		 */
+		bp->b_blkno = bn;		/* swapdev block number */
+		vp = sdp->swd_vp;		/* swapdev vnode pointer */
+		bp->b_dev = sdp->swd_dev;	/* swapdev dev_t */
+		VHOLD(vp);			/* "hold" swapdev vp for i/o */
+
+		/*
+		 * if we are doing a write, we have to redirect the i/o on
+		 * drum's v_numoutput counter to the swapdevs.
+		 */
+		if ((bp->b_flags & B_READ) == 0) {
+			int s = splbio();
+			vwakeup(bp);	/* kills one 'v_numoutput' on drum */
+			vp->v_numoutput++;	/* put it on swapdev */
+			splx(s);
+		}
+
+		/*
+		 * disassociate buffer from /dev/drum vnode
+		 * [could be null if buf was from physio]
+		 */
+		if (bp->b_vp != NULLVP)
+			brelvp(bp);
+
+		/*
+		 * finally plug in swapdev vnode and start I/O
+		 */
+		bp->b_vp = vp;
+		VOP_STRATEGY(bp);
+		return;
+#ifdef SWAP_TO_FILES
+	case VREG:
+		/*
+		 * delegate to sw_reg_strategy function.
+		 */
+		sw_reg_strategy(sdp, bp, bn);
+		return;
+#endif
+	}
+	/* NOTREACHED */
+}
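[Editorial sketch, not part of the imported file: the two-step conversion in swstrategy is plain arithmetic. This minimal program assumes 512-byte disk blocks and 4k pages (common values, but MD code may differ) and re-implements btodb()/dbtob() as userland stand-ins.]

#include <stdio.h>

#define DEV_BSIZE	512		/* assumed disk block size */
#define PAGE_SHIFT	12		/* assumed 4k pages */

/* userland stand-ins for the kernel's btodb()/dbtob() macros */
#define btodb(bytes)	((bytes) / DEV_BSIZE)
#define dbtob(blocks)	((blocks) * DEV_BSIZE)

int
main(void)
{
	int drumoffset = 1;	/* first drum page of some swapdev */
	long b_blkno = 24;	/* disk block the buf targets on /dev/drum */
	int pageno, bn;

	/* the same two-step conversion swstrategy performs */
	pageno = dbtob(b_blkno) >> PAGE_SHIFT;	/* drum block -> drum page */
	pageno = pageno - drumoffset;		/* drum page -> page on swapdev */
	bn = btodb(pageno << PAGE_SHIFT);	/* page -> block on swapdev */

	/* 24 blocks = 12288 bytes = drum page 3 = swapdev page 2 = block 16 */
	printf("drum blkno %ld -> swapdev blkno %d\n", b_blkno, bn);
	return 0;
}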
+
+#ifdef SWAP_TO_FILES
+/*
+ * sw_reg_strategy: handle swap i/o to regular files
+ */
+static void
+sw_reg_strategy(sdp, bp, bn)
+	struct swapdev	*sdp;
+	struct buf	*bp;
+	int		bn;
+{
+	struct vnode	*vp;
+	struct vndxfer	*vnx;
+	daddr_t		nbn, byteoff;
+	caddr_t		addr;
+	int		s, off, nra, error, sz, resid;
+	UVMHIST_FUNC("sw_reg_strategy"); UVMHIST_CALLED(pdhist);
+
+	/*
+	 * allocate a vndxfer head for this transfer and point it to
+	 * our buffer.
+	 */
+	getvndxfer(vnx);
+	vnx->vx_flags = VX_BUSY;
+	vnx->vx_error = 0;
+	vnx->vx_pending = 0;
+	vnx->vx_bp = bp;
+	vnx->vx_sdp = sdp;
+
+	/*
+	 * setup for main loop where we read filesystem blocks into
+	 * our buffer.
+	 */
+	error = 0;
+	bp->b_resid = bp->b_bcount;	/* nothing transferred yet! */
+	addr = bp->b_data;		/* current position in buffer */
+	byteoff = dbtob(bn);
+
+	for (resid = bp->b_resid; resid; resid -= sz) {
+		struct vndbuf	*nbp;
+
+		/*
+		 * translate byteoffset into block number.  return values:
+		 *   vp = vnode of underlying device
+		 *  nbn = new block number (on underlying vnode dev)
+		 *  nra = num blocks we can read-ahead (excludes requested
+		 *	block)
+		 */
+		nra = 0;
+		error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
+				 &vp, &nbn, &nra);
+
+		if (error == 0 && (long)nbn == -1) {
+			/*
+			 * this used to just set error, but that doesn't
+			 * do the right thing.  Instead, it causes random
+			 * memory errors.  The panic() should remain until
+			 * this condition doesn't destabilize the system.
+			 */
+#if 1
+			panic("sw_reg_strategy: swap to sparse file");
+#else
+			error = EIO;	/* failure */
+#endif
+		}
+
+		/*
+		 * punt if there was an error or a hole in the file.
+		 * we must wait for any i/o ops we have already started
+		 * to finish before returning.
+		 *
+		 * XXX we could deal with holes here but it would be
+		 * a hassle (in the write case).
+		 */
+		if (error) {
+			s = splbio();
+			vnx->vx_error = error;	/* pass error up */
+			goto out;
+		}
+
+		/*
+		 * compute the size ("sz") of this transfer (in bytes).
+		 * XXXCDC: ignores read-ahead for non-zero offset
+		 */
+		if ((off = (byteoff % sdp->swd_bsize)) != 0)
+			sz = sdp->swd_bsize - off;
+		else
+			sz = (1 + nra) * sdp->swd_bsize;
+
+		if (resid < sz)
+			sz = resid;
+
+		UVMHIST_LOG(pdhist, "sw_reg_strategy: vp %p/%p offset 0x%x/0x%x",
+				sdp->swd_vp, vp, byteoff, nbn);
+
+		/*
+		 * now get a buf structure.   note that the vb_buf is
+		 * at the front of the nbp structure so that you can
+		 * cast pointers between the two structures easily.
+		 */
+		getvndbuf(nbp);
+		nbp->vb_buf.b_flags    = bp->b_flags | B_CALL;
+		nbp->vb_buf.b_bcount   = sz;
+#if 0
+		nbp->vb_buf.b_bufsize  = bp->b_bufsize; /* XXXCDC: really? */
+#endif
+		nbp->vb_buf.b_bufsize  = sz;
+		nbp->vb_buf.b_error    = 0;
+		nbp->vb_buf.b_data     = addr;
+		nbp->vb_buf.b_blkno    = nbn + btodb(off);
+		nbp->vb_buf.b_proc     = bp->b_proc;
+		nbp->vb_buf.b_iodone   = sw_reg_iodone;
+		nbp->vb_buf.b_vp       = NULLVP;
+		nbp->vb_buf.b_vnbufs.le_next = NOLIST;
+		nbp->vb_buf.b_rcred    = sdp->swd_cred;
+		nbp->vb_buf.b_wcred    = sdp->swd_cred;
+
+		/*
+		 * set b_dirtyoff/end and b_validoff/end.   this is
+		 * required by the NFS client code (otherwise it will
+		 * just discard our I/O request).
+		 */
+		if (bp->b_dirtyend == 0) {
+			nbp->vb_buf.b_dirtyoff = 0;
+			nbp->vb_buf.b_dirtyend = sz;
+		} else {
+			nbp->vb_buf.b_dirtyoff =
+			    max(0, bp->b_dirtyoff - (bp->b_bcount-resid));
+			nbp->vb_buf.b_dirtyend =
+			    min(sz,
+				max(0, bp->b_dirtyend - (bp->b_bcount-resid)));
+		}
+		if (bp->b_validend == 0) {
+			nbp->vb_buf.b_validoff = 0;
+			nbp->vb_buf.b_validend = sz;
+		} else {
+			nbp->vb_buf.b_validoff =
+			    max(0, bp->b_validoff - (bp->b_bcount-resid));
+			nbp->vb_buf.b_validend =
+			    min(sz,
+				max(0, bp->b_validend - (bp->b_bcount-resid)));
+		}
+
+		nbp->vb_xfer = vnx;	/* patch it back in to vnx */
+
+		/*
+		 * Just sort by block number
+		 */
+		nbp->vb_buf.b_cylinder = nbp->vb_buf.b_blkno;
+		s = splbio();
+		if (vnx->vx_error != 0) {
+			putvndbuf(nbp);
+			goto out;
+		}
+		vnx->vx_pending++;
+
+		/* assoc new buffer with underlying vnode */
+		bgetvp(vp, &nbp->vb_buf);
+
+		/* sort it in and start I/O if we are not over our limit */
+		disksort(&sdp->swd_tab, &nbp->vb_buf);
+		sw_reg_start(sdp);
+		splx(s);
+
+		/*
+		 * advance to the next I/O
+		 */
+		byteoff += sz;
+		addr += sz;
+	}
+
+	s = splbio();
+
+out: /* Arrive here at splbio */
+	vnx->vx_flags &= ~VX_BUSY;
+	if (vnx->vx_pending == 0) {
+		if (vnx->vx_error != 0) {
+			bp->b_error = vnx->vx_error;
+			bp->b_flags |= B_ERROR;
+		}
+		putvndxfer(vnx);
+		biodone(bp);
+	}
+	splx(s);
+}
+
+/*
+ * sw_reg_start: start an I/O request on the requested swapdev
+ *
+ * => reqs are sorted by disksort (above)
+ */
+static void
+sw_reg_start(sdp)
+	struct swapdev	*sdp;
+{
+	struct buf	*bp;
+	UVMHIST_FUNC("sw_reg_start"); UVMHIST_CALLED(pdhist);
+
+	/* recursion control */
+	if ((sdp->swd_flags & SWF_BUSY) != 0)
+		return;
+
+	sdp->swd_flags |= SWF_BUSY;
+
+	while (sdp->swd_tab.b_active < sdp->swd_maxactive) {
+		bp = sdp->swd_tab.b_actf;
+		if (bp == NULL)
+			break;
+		sdp->swd_tab.b_actf = bp->b_actf;
+		sdp->swd_tab.b_active++;
+
+		UVMHIST_LOG(pdhist,
+		    "sw_reg_start:  bp %p vp %p blkno %p cnt %lx",
+		    bp, bp->b_vp, bp->b_blkno, bp->b_bcount);
+		if ((bp->b_flags & B_READ) == 0)
+			bp->b_vp->v_numoutput++;
+		VOP_STRATEGY(bp);
+	}
+	sdp->swd_flags &= ~SWF_BUSY;
+}
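[Editorial sketch, not part of the imported file: the per-iteration chunking rule in sw_reg_strategy's main loop can be exercised in isolation. This assumes an 8k filesystem block size (swd_bsize) and no read-ahead (nra == 0); a transfer that starts mid-block first finishes the partial block, then proceeds block by block.]

#include <stdio.h>

int
main(void)
{
	int bsize = 8192;	/* assumed filesystem block size (swd_bsize) */
	int byteoff = 4096;	/* transfer starts mid-block */
	int resid = 20480;	/* 5 pages of 4k left to move */
	int off, sz;

	/* the same chunking rule sw_reg_strategy applies per iteration */
	while (resid > 0) {
		off = byteoff % bsize;
		if (off != 0)
			sz = bsize - off;	/* finish the partial block */
		else
			sz = bsize;		/* full block (nra == 0 here) */
		if (resid < sz)
			sz = resid;
		printf("xfer %5d bytes at file offset %6d\n", sz, byteoff);
		byteoff += sz;
		resid -= sz;
	}
	return 0;
}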
+
+/*
+ * sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
+ *
+ * => note that we can recover the vndbuf struct by casting the buf ptr
+ */
+static void
+sw_reg_iodone(bp)
+	struct buf *bp;
+{
+	struct vndbuf *vbp = (struct vndbuf *) bp;
+	struct vndxfer *vnx = vbp->vb_xfer;
+	struct buf *pbp = vnx->vx_bp;		/* parent buffer */
+	struct swapdev	*sdp = vnx->vx_sdp;
+	int	s, resid;
+	UVMHIST_FUNC("sw_reg_iodone"); UVMHIST_CALLED(pdhist);
+
+	UVMHIST_LOG(pdhist, "  vbp=%p vp=%p blkno=%x addr=%p",
+	    vbp, vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno, vbp->vb_buf.b_data);
+	UVMHIST_LOG(pdhist, "  cnt=%lx resid=%lx",
+	    vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);
+
+	/*
+	 * protect vbp at splbio and update.
+	 */
+
+	s = splbio();
+	resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
+	pbp->b_resid -= resid;
+	vnx->vx_pending--;
+
+	if (vbp->vb_buf.b_error) {
+		UVMHIST_LOG(pdhist, "  got error=%d !",
+		    vbp->vb_buf.b_error, 0, 0, 0);
+
+		/* pass error upward */
+		vnx->vx_error = vbp->vb_buf.b_error;
+	}
+
+	/*
+	 * drop "hold" reference to vnode (if one)
+	 * XXXCDC: always set to NULLVP, this is useless, right?
+	 */
+	if (vbp->vb_buf.b_vp != NULLVP)
+		brelvp(&vbp->vb_buf);
+
+	/*
+	 * kill vbp structure
+	 */
+	putvndbuf(vbp);
+
+	/*
+	 * wrap up this transaction if it has run to completion or, in
+	 * case of an error, when all auxiliary buffers have returned.
+	 */
+	if (vnx->vx_error != 0) {
+		/* pass error upward */
+		pbp->b_flags |= B_ERROR;
+		pbp->b_error = vnx->vx_error;
+		if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
+			putvndxfer(vnx);
+			biodone(pbp);
+		}
+	} else if (pbp->b_resid == 0) {
+#ifdef DIAGNOSTIC
+		if (vnx->vx_pending != 0)
+			panic("sw_reg_iodone: vnx pending: %d",vnx->vx_pending);
+#endif
+
+		if ((vnx->vx_flags & VX_BUSY) == 0) {
+			UVMHIST_LOG(pdhist, "  iodone error=%d !",
+			    pbp, vnx->vx_error, 0, 0);
+			putvndxfer(vnx);
+			biodone(pbp);
+		}
+	}
+
+	/*
+	 * done!   start next swapdev I/O if one is pending
+	 */
+	sdp->swd_tab.b_active--;
+	sw_reg_start(sdp);
+
+	splx(s);
+}
+#endif /* SWAP_TO_FILES */
+
+
+/*
+ * uvm_swap_alloc: allocate space on swap
+ *
+ * => allocation is done "round robin" down the priority list, as we
+ *	allocate in a priority we "rotate" the circle queue.
+ * => space can be freed with uvm_swap_free
+ * => we return the page slot number in /dev/drum (0 == invalid slot)
+ * => we lock swap_data_lock
+ * => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
+ */
+int
+uvm_swap_alloc(nslots, lessok)
+	int *nslots;	/* IN/OUT */
+	boolean_t lessok;
+{
+	struct swapdev *sdp;
+	struct swappri *spp;
+	u_long	result;
+	UVMHIST_FUNC("uvm_swap_alloc"); UVMHIST_CALLED(pdhist);
+
+	/*
+	 * no swap devices configured yet?   definite failure.
+	 */
+	if (uvmexp.nswapdev < 1)
+		return 0;
+
+	/*
+	 * lock data lock, convert slots into blocks, and enter loop
+	 */
+	simple_lock(&swap_data_lock);
+
+ReTry:	/* XXXMRG */
+	for (spp = swap_priority.lh_first; spp != NULL;
+	     spp = spp->spi_swappri.le_next) {
+		for (sdp = spp->spi_swapdev.cqh_first;
+		     sdp != (void *)&spp->spi_swapdev;
+		     sdp = sdp->swd_next.cqe_next) {
+			/* if it's not enabled, then we can't swap from it */
+			if ((sdp->swd_flags & SWF_ENABLE) == 0)
+				continue;
+			if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
+				continue;
+			if (extent_alloc(sdp->swd_ex, *nslots, EX_NOALIGN,
+					 EX_NOBOUNDARY, EX_MALLOCOK|EX_NOWAIT,
+					 &result) != 0) {
+				continue;
+			}
+
+			/*
+			 * successful allocation!  now rotate the circleq.
+			 */
+			CIRCLEQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
+			CIRCLEQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
+			sdp->swd_npginuse += *nslots;
+			uvmexp.swpginuse += *nslots;
+			simple_unlock(&swap_data_lock);
+			/* done!  return drum slot number */
+			UVMHIST_LOG(pdhist,
+			    "success!  returning %d slots starting at %d",
+			    *nslots, result + sdp->swd_drumoffset, 0, 0);
+#if 0
+{
+	struct swapdev *sdp2;
+
+	sdp2 = swapdrum_getsdp(result + sdp->swd_drumoffset);
+	if (sdp2 == NULL) {
+printf("uvm_swap_alloc: nslots=%d, dev=%x, drumoff=%d, result=%ld",
+    *nslots, sdp->swd_dev, sdp->swd_drumoffset, result);
+panic("uvm_swap_alloc: allocating unmapped swap block!");
+	}
+}
+#endif
+			return(result + sdp->swd_drumoffset);
+		}
+	}
+
+	/* XXXMRG: BEGIN HACK */
+	if (*nslots > 1 && lessok) {
+		*nslots = 1;
+		goto ReTry;	/* XXXMRG: ugh!  extent should support this for us */
+	}
+	/* XXXMRG: END HACK */
+
+	simple_unlock(&swap_data_lock);
+	return 0;		/* failed */
+}
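[Editorial sketch, not part of the imported file: the CIRCLEQ_REMOVE + CIRCLEQ_INSERT_TAIL pair above is what round-robins allocations across devices of equal priority. A toy model with array indices standing in for swapdev entries shows the effect.]

#include <stdio.h>

int
main(void)
{
	/* three same-priority devices, by index */
	int q[3] = { 0, 1, 2 };
	int i, j, first;

	/*
	 * each "allocation" takes the head device, then rotates it to
	 * the tail, as uvm_swap_alloc does with CIRCLEQ_REMOVE +
	 * CIRCLEQ_INSERT_TAIL, so successive allocations round-robin.
	 */
	for (i = 0; i < 6; i++) {
		first = q[0];
		printf("allocation %d -> device %d\n", i, first);
		for (j = 0; j < 2; j++)	/* rotate head to tail */
			q[j] = q[j + 1];
		q[2] = first;
	}
	return 0;
}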
+
+/*
+ * uvm_swap_free: free swap slots
+ *
+ * => this can be all or part of an allocation made by uvm_swap_alloc
+ * => we lock swap_data_lock
+ */
+void
+uvm_swap_free(startslot, nslots)
+	int startslot;
+	int nslots;
+{
+	struct swapdev *sdp;
+	UVMHIST_FUNC("uvm_swap_free"); UVMHIST_CALLED(pdhist);
+
+	UVMHIST_LOG(pdhist, "freeing %d slots starting at %d", nslots,
+	    startslot, 0, 0);
+	/*
+	 * convert drum slot offset back to sdp, free the blocks
+	 * in the extent, and return.   must hold pri lock to do
+	 * lookup and access the extent.
+	 */
+	simple_lock(&swap_data_lock);
+	sdp = swapdrum_getsdp(startslot);
+
+#ifdef DIAGNOSTIC
+	if (uvmexp.nswapdev < 1)
+		panic("uvm_swap_free: uvmexp.nswapdev < 1\n");
+	if (sdp == NULL) {
+		printf("uvm_swap_free: startslot %d, nslots %d\n", startslot,
+		    nslots);
+		panic("uvm_swap_free: unmapped address\n");
+	}
+#endif
+	if (extent_free(sdp->swd_ex, startslot - sdp->swd_drumoffset, nslots,
+			EX_MALLOCOK|EX_NOWAIT) != 0)
+		printf("warning: resource shortage: %d slots of swap lost\n",
+		    nslots);
+
+	sdp->swd_npginuse -= nslots;
+	uvmexp.swpginuse -= nslots;
+#ifdef DIAGNOSTIC
+	if (sdp->swd_npginuse < 0)
+		panic("uvm_swap_free: inuse < 0");
+#endif
+	simple_unlock(&swap_data_lock);
+}
+
+/*
+ * uvm_swap_put: put any number of pages into a contig place on swap
+ *
+ * => can be sync or async
+ * => XXXMRG: consider making it an inline or macro
+ */
+int
+uvm_swap_put(swslot, ppsp, npages, flags)
+	int swslot;
+	struct vm_page **ppsp;
+	int	npages;
+	int	flags;
+{
+	int	result;
+
+#if 0
+	flags |= PGO_SYNCIO;	/* XXXMRG: tmp, force sync */
+#endif
+
+	result = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
+	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
+
+	return (result);
+}
+
+/*
+ * uvm_swap_get: get a single page from swap
+ *
+ * => usually a sync op (from fault)
+ * => XXXMRG: consider making it an inline or macro
+ */
+int
+uvm_swap_get(page, swslot, flags)
+	struct vm_page *page;
+	int swslot, flags;
+{
+	int	result;
+
+	uvmexp.nswget++;
+#ifdef DIAGNOSTIC
+	if ((flags & PGO_SYNCIO) == 0)
+		printf("uvm_swap_get: ASYNC get requested?\n");
+#endif
+
+	result = uvm_swap_io(&page, swslot, 1, B_READ |
+	    ((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
+
+	return (result);
+}
+
+/*
+ * uvm_swap_io: do an i/o operation to swap
+ */
+
+static int
+uvm_swap_io(pps, startslot, npages, flags)
+	struct vm_page **pps;
+	int startslot, npages, flags;
+{
+	daddr_t startblk;
+	struct	swapbuf *sbp;
+	struct	buf *bp;
+	vaddr_t kva;
+	int	result, s, waitf, pflag;
+	UVMHIST_FUNC("uvm_swap_io"); UVMHIST_CALLED(pdhist);
+
+	UVMHIST_LOG(pdhist, "<- called, startslot=%d, npages=%d, flags=%d",
+	    startslot, npages, flags, 0);
+	/*
+	 * convert starting drum slot to block number
+	 */
+	startblk = btodb(startslot << PAGE_SHIFT);
+
+	/*
+	 * first, map the pages into the kernel (XXX: currently required
+	 * by buffer system).  note that we don't let pagermapin alloc
+	 * an aiodesc structure because we don't want to chance a malloc.
+	 * we've got our own pool of aiodesc structures (in swapbuf).
+	 */
+	waitf = (flags & B_ASYNC) ? M_NOWAIT : M_WAITOK;
+	kva = uvm_pagermapin(pps, npages, NULL, waitf);
+	if (kva == NULL)
+		return (VM_PAGER_AGAIN);
+
+	/*
+	 * now allocate a swap buffer off of freesbufs
+	 * [make sure we don't put the pagedaemon to sleep...]
+	 */
+	s = splbio();
+	pflag = ((flags & B_ASYNC) != 0 || curproc == uvm.pagedaemon_proc)
+		? 0
+		: PR_WAITOK;
+	sbp = pool_get(swapbuf_pool, pflag);
+	splx(s);		/* drop splbio */
+
+	/*
+	 * if we failed to get a swapbuf, return "try again"
+	 */
+	if (sbp == NULL)
+		return (VM_PAGER_AGAIN);
+
+	/*
+	 * fill in the bp/sbp.   we currently route our i/o through
+	 * /dev/drum's vnode [swapdev_vp].
+	 */
+	bp = &sbp->sw_buf;
+	bp->b_flags = B_BUSY | B_NOCACHE | (flags & (B_READ|B_ASYNC));
+	bp->b_proc = &proc0;	/* XXX */
+	bp->b_rcred = bp->b_wcred = proc0.p_ucred;
+	bp->b_vnbufs.le_next = NOLIST;
+	bp->b_data = (caddr_t)kva;
+	bp->b_blkno = startblk;
+	VHOLD(swapdev_vp);
+	bp->b_vp = swapdev_vp;
+	/* XXXCDC: isn't swapdev_vp always a VCHR? */
+	/* XXXMRG: probably -- this is obviously something inherited... */
+	if (swapdev_vp->v_type == VBLK)
+		bp->b_dev = swapdev_vp->v_rdev;
+	bp->b_bcount = npages << PAGE_SHIFT;
+
+	/*
+	 * for pageouts we must set "dirtyoff" [NFS client code needs it].
+	 * and we bump v_numoutput (counter of number of active outputs).
+	 */
+	if ((bp->b_flags & B_READ) == 0) {
+		bp->b_dirtyoff = 0;
+		bp->b_dirtyend = npages << PAGE_SHIFT;
+		s = splbio();
+		swapdev_vp->v_numoutput++;
+		splx(s);
+	}
+
+	/*
+	 * for async ops we must set up the aiodesc and setup the callback
+	 * XXX: we expect no async-reads, but we don't prevent it here.
+	 */
+	if (flags & B_ASYNC) {
+		sbp->sw_aio.aiodone = uvm_swap_aiodone;
+		sbp->sw_aio.kva = kva;
+		sbp->sw_aio.npages = npages;
+		sbp->sw_aio.pd_ptr = sbp;	/* backpointer */
+		bp->b_flags |= B_CALL;		/* set callback */
+		bp->b_iodone = uvm_swap_bufdone;/* "buf" iodone function */
+		UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
+	}
+	UVMHIST_LOG(pdhist,
+	    "about to start io: data = 0x%p blkno = 0x%x, bcount = %ld",
+	    bp->b_data, bp->b_blkno, bp->b_bcount, 0);
+
+	/*
+	 * now we start the I/O, and if async, return.
+	 */
+	VOP_STRATEGY(bp);
+	if (flags & B_ASYNC)
+		return (VM_PAGER_PEND);
+
+	/*
+	 * must be sync i/o.   wait for it to finish
+	 */
+	bp->b_error = biowait(bp);
+	result = (bp->b_flags & B_ERROR) ? VM_PAGER_ERROR : VM_PAGER_OK;
+
+	/*
+	 * kill the pager mapping
+	 */
+	uvm_pagermapout(kva, npages);
+
+	/*
+	 * now dispose of the swap buffer
+	 */
+	s = splbio();
+	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY|B_NOCACHE);
+	if (bp->b_vp)
+		brelvp(bp);
+
+	pool_put(swapbuf_pool, sbp);
+	splx(s);
+
+	/*
+	 * finally return.
+	 */
+	UVMHIST_LOG(pdhist, "<- done (sync)  result=%d", result, 0, 0, 0);
+	return (result);
+}
+
+/*
+ * uvm_swap_bufdone: called from the buffer system when the i/o is done
+ */
+static void
+uvm_swap_bufdone(bp)
+	struct buf *bp;
+{
+	struct swapbuf *sbp = (struct swapbuf *) bp;
+	int	s = splbio();
+	UVMHIST_FUNC("uvm_swap_bufdone"); UVMHIST_CALLED(pdhist);
+
+	UVMHIST_LOG(pdhist, "cleaning buf %p", buf, 0, 0, 0);
+#ifdef DIAGNOSTIC
+	/*
+	 * sanity check: swapbufs are private, so they shouldn't be wanted
+	 */
+	if (bp->b_flags & B_WANTED)
+		panic("uvm_swap_bufdone: private buf wanted");
+#endif
+
+	/*
+	 * drop the buffer's reference to the vnode and its flags.
+	 */
+	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_PAGET|B_UAREA|B_DIRTY|B_NOCACHE);
+	if (bp->b_vp)
+		brelvp(bp);
+
+	/*
+	 * now put the aio on the uvm.aio_done list and wake the
+	 * pagedaemon (which will finish up our job in its context).
+	 */
+	simple_lock(&uvm.pagedaemon_lock);	/* locks uvm.aio_done */
+	TAILQ_INSERT_TAIL(&uvm.aio_done, &sbp->sw_aio, aioq);
+	simple_unlock(&uvm.pagedaemon_lock);
+
+	thread_wakeup(&uvm.pagedaemon);
+	splx(s);
+}
+
+/*
+ * uvm_swap_aiodone: aiodone function for anonymous memory
+ *
+ * => this is called in the context of the pagedaemon (but with the
+ *	page queues unlocked!)
+ * => our "aio" structure must be part of a "swapbuf"
+ */
+static void
+uvm_swap_aiodone(aio)
+	struct uvm_aiodesc *aio;
+{
+	struct swapbuf *sbp = aio->pd_ptr;
+	struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT];
+	int	lcv, s;
+	vaddr_t addr;
+	UVMHIST_FUNC("uvm_swap_aiodone"); UVMHIST_CALLED(pdhist);
+
+	UVMHIST_LOG(pdhist, "done with aio %p", aio, 0, 0, 0);
+#ifdef DIAGNOSTIC
+	/*
+	 * sanity check
+	 */
+	if (aio->npages > (MAXBSIZE >> PAGE_SHIFT))
+		panic("uvm_swap_aiodone: aio too big!");
+#endif
+
+	/*
+	 * first, we have to recover the page pointers (pps) by poking in the
+	 * kernel pmap (XXX: should be saved in the buf structure).
+	 */
+	for (addr = aio->kva, lcv = 0 ; lcv < aio->npages ;
+	     addr += PAGE_SIZE, lcv++) {
+		pps[lcv] = uvm_pageratop(addr);
+	}
+
+	/*
+	 * now we can dispose of the kernel mappings of the buffer
+	 */
+	uvm_pagermapout(aio->kva, aio->npages);
+
+	/*
+	 * now we can dispose of the pages by using the dropcluster function
+	 * [note that we have no "page of interest" so we pass in null]
+	 */
+	uvm_pager_dropcluster(NULL, NULL, pps, &aio->npages,
+	    PGO_PDFREECLUST, 0);
+
+	/*
+	 * finally, we can dispose of the swapbuf
+	 */
+	s = splbio();
+	pool_put(swapbuf_pool, sbp);
+	splx(s);
+
+	/*
+	 * done!
+	 */
+}