| author | Artur Grabowski <art@cvs.openbsd.org> | 2001-11-27 05:27:13 +0000 |
|---|---|---|
| committer | Artur Grabowski <art@cvs.openbsd.org> | 2001-11-27 05:27:13 +0000 |
| commit | 8a1845e49f56720cbfccd4c7f5f80ba5b980fdf4 (patch) | |
| tree | d4a522dc41cdc79ba48fe761e94663b795da8cc0 /sys | |
| parent | 0d68e9b5af14f4bfa04d22dbebab5972ac647b26 (diff) | |
Merge in the unified buffer cache code as found in NetBSD 2001/03/10. The
code was written mostly by Chuck Silvers <chuq@chuq.com>/<chs@netbsd.org>.
It has been tested for the past few weeks by many developers and should be in a
pretty stable state, but it will still require optimizations and additional cleanups.
Diffstat (limited to 'sys')
72 files changed, 4246 insertions, 2516 deletions
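The recurring change in the diff below is the read path for regular files: instead of going through getblk()/bread() buffers, the per-filesystem VOP_READ routines (adosfs, cd9660, msdosfs, nfs) now map a window of the vnode's UVM object with ubc_alloc(), copy out of it with uiomove(), and drop the window with ubc_release(); genfs_getpages() fills the window from backing store on demand. A condensed sketch of that loop follows; the function name and the file_size parameter are illustrative stand-ins for the per-filesystem size field (ip->i_size, dep->de_FileSize, np->n_size), not identifiers from the commit.

```c
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <uvm/uvm.h>		/* ubc_alloc(), ubc_release(), UBC_READ */

/*
 * Sketch of the UBC read loop added to the VOP_READ implementations in
 * this commit; "file_size" stands in for the filesystem's own size field.
 */
static int
ubc_read_sketch(struct vnode *vp, struct uio *uio, off_t file_size)
{
	int error = 0;

	while (uio->uio_resid > 0) {
		void *win;
		vsize_t bytelen = MIN(file_size - uio->uio_offset,
		    uio->uio_resid);

		if (bytelen == 0)
			break;				/* at EOF */

		/* Map a window of the vnode's page cache... */
		win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset,
		    &bytelen, UBC_READ);
		/* ...copy it to userspace (faulting pages in as needed)... */
		error = uiomove(win, bytelen, uio);
		/* ...and unmap the window. */
		ubc_release(win, 0);
		if (error)
			break;
	}
	return (error);
}
```

The write side uses the same ubc_alloc()/uiomove()/ubc_release() sequence (see the msdosfs_write hunk), followed by an explicit pgo_flush of the object when IO_SYNC is requested.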
diff --git a/sys/adosfs/advnops.c b/sys/adosfs/advnops.c index 78d237f41e5..19bfdcc5738 100644 --- a/sys/adosfs/advnops.c +++ b/sys/adosfs/advnops.c @@ -1,4 +1,4 @@ -/* $OpenBSD: advnops.c,v 1.17 2001/06/23 02:14:21 csapuntz Exp $ */ +/* $OpenBSD: advnops.c,v 1.18 2001/11/27 05:27:11 art Exp $ */ /* $NetBSD: advnops.c,v 1.32 1996/10/13 02:52:09 christos Exp $ */ /* @@ -131,7 +131,9 @@ struct vnodeopv_entry_desc adosfs_vnodeop_entries[] = { { &vop_pathconf_desc, adosfs_pathconf }, /* pathconf */ { &vop_advlock_desc, adosfs_advlock }, /* advlock */ { &vop_bwrite_desc, adosfs_bwrite }, /* bwrite */ - { (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL } + { &vop_getpages_desc, genfs_getpages }, + { &vop_size_desc, genfs_size }, + { NULL, NULL } }; struct vnodeopv_desc adosfs_vnodeop_opv_desc = @@ -272,6 +274,28 @@ adosfs_read(v) /* * taken from ufs_read() */ + + if (sp->a_vp->v_type == VREG) { + error = 0; + while (uio->uio_resid > 0) { + void *win; + vsize_t bytelen = min(ap->fsize - uio->uio_offset, + uio->uio_resid); + + if (bytelen == 0) { + break; + } + win = ubc_alloc(&sp->a_vp->v_uvm.u_obj, uio->uio_offset, + &bytelen, UBC_READ); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + if (error) { + break; + } + } + goto reterr; + } + do { /* * we are only supporting ADosFFS currently diff --git a/sys/arch/alpha/alpha/pmap.c b/sys/arch/alpha/alpha/pmap.c index 9ff390da8c2..1d50a35d446 100644 --- a/sys/arch/alpha/alpha/pmap.c +++ b/sys/arch/alpha/alpha/pmap.c @@ -1,4 +1,4 @@ -/* $OpenBSD: pmap.c,v 1.23 2001/11/09 15:31:11 art Exp $ */ +/* $OpenBSD: pmap.c,v 1.24 2001/11/27 05:27:11 art Exp $ */ /* $NetBSD: pmap.c,v 1.154 2000/12/07 22:18:55 thorpej Exp $ */ /*- @@ -804,8 +804,8 @@ pmap_bootstrap(paddr_t ptaddr, u_int maxasn, u_long ncpuids) /* * Figure out how many PTE's are necessary to map the kernel. 
*/ - lev3mapsize = (VM_PHYS_SIZE + - nbuf * MAXBSIZE + + PAGER_MAP_SIZE + 16 * NCARGS) / NBPG + + lev3mapsize = (VM_PHYS_SIZE + ubc_nwins * ubc_winsize + + nbuf * MAXBSIZE + 16 * NCARGS + PAGER_MAP_SIZE) / NBPG + (maxproc * UPAGES) + NKMEMCLUSTERS; #ifdef SYSVSHM diff --git a/sys/arch/i386/i386/vm_machdep.c b/sys/arch/i386/i386/vm_machdep.c index 516dea6ebea..7de82391532 100644 --- a/sys/arch/i386/i386/vm_machdep.c +++ b/sys/arch/i386/i386/vm_machdep.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vm_machdep.c,v 1.32 2001/11/06 19:53:14 miod Exp $ */ +/* $OpenBSD: vm_machdep.c,v 1.33 2001/11/27 05:27:11 art Exp $ */ /* $NetBSD: vm_machdep.c,v 1.61 1996/05/03 19:42:35 christos Exp $ */ /*- @@ -371,9 +371,7 @@ vmapbuf(bp, len) while (len) { pmap_extract(vm_map_pmap(&bp->b_proc->p_vmspace->vm_map), faddr, &fpa); - pmap_enter(vm_map_pmap(phys_map), taddr, fpa, - VM_PROT_READ | VM_PROT_WRITE, - VM_PROT_READ | VM_PROT_WRITE | PMAP_WIRED); + pmap_kenter_pa(taddr, fpa, VM_PROT_READ|VM_PROT_WRITE); faddr += PAGE_SIZE; taddr += PAGE_SIZE; len -= PAGE_SIZE; @@ -396,6 +394,7 @@ vunmapbuf(bp, len) addr = trunc_page((vaddr_t)bp->b_data); off = (vm_offset_t)bp->b_data - addr; len = round_page(off + len); + pmap_kremove(addr, len); uvm_km_free_wakeup(phys_map, addr, len); bp->b_data = bp->b_saveaddr; bp->b_saveaddr = 0; diff --git a/sys/conf/files b/sys/conf/files index a6ce3bcedb0..0ec11fc5bbb 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1,4 +1,4 @@ -# $OpenBSD: files,v 1.230 2001/11/21 21:23:56 csapuntz Exp $ +# $OpenBSD: files,v 1.231 2001/11/27 05:27:11 art Exp $ # $NetBSD: files,v 1.87 1996/05/19 17:17:50 jonathan Exp $ # @(#)files.newconf 7.5 (Berkeley) 5/10/93 @@ -801,6 +801,7 @@ file xfs/xfs_syscalls-dummy.c !xfs file uvm/uvm_amap.c file uvm/uvm_anon.c file uvm/uvm_aobj.c +file uvm/uvm_bio.c file uvm/uvm_device.c file uvm/uvm_fault.c file uvm/uvm_glue.c diff --git a/sys/dev/vnd.c b/sys/dev/vnd.c index b2935e0edba..6f8c268a283 100644 --- a/sys/dev/vnd.c +++ b/sys/dev/vnd.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vnd.c,v 1.28 2001/11/15 23:15:15 art Exp $ */ +/* $OpenBSD: vnd.c,v 1.29 2001/11/27 05:27:11 art Exp $ */ /* $NetBSD: vnd.c,v 1.26 1996/03/30 23:06:11 christos Exp $ */ /* @@ -558,10 +558,6 @@ vndstrategy(bp) nbp->vb_buf.b_proc = bp->b_proc; nbp->vb_buf.b_iodone = vndiodone; nbp->vb_buf.b_vp = vp; - nbp->vb_buf.b_dirtyoff = bp->b_dirtyoff; - nbp->vb_buf.b_dirtyend = bp->b_dirtyend; - nbp->vb_buf.b_validoff = bp->b_validoff; - nbp->vb_buf.b_validend = bp->b_validend; LIST_INIT(&nbp->vb_buf.b_dep); /* save a reference to the old buffer */ diff --git a/sys/isofs/cd9660/cd9660_vfsops.c b/sys/isofs/cd9660/cd9660_vfsops.c index b4199c4df15..b2b1455e6eb 100644 --- a/sys/isofs/cd9660/cd9660_vfsops.c +++ b/sys/isofs/cd9660/cd9660_vfsops.c @@ -1,4 +1,4 @@ -/* $OpenBSD: cd9660_vfsops.c,v 1.24 2001/11/15 08:27:28 art Exp $ */ +/* $OpenBSD: cd9660_vfsops.c,v 1.25 2001/11/27 05:27:11 art Exp $ */ /* $NetBSD: cd9660_vfsops.c,v 1.26 1997/06/13 15:38:58 pk Exp $ */ /*- @@ -359,6 +359,8 @@ iso_mountfs(devvp, mp, p, argp) mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; mp->mnt_maxsymlinklen = 0; mp->mnt_flag |= MNT_LOCAL; + mp->mnt_dev_bshift = iso_bsize; + mp->mnt_fs_bshift = isomp->im_bshift; isomp->im_mountp = mp; isomp->im_dev = dev; isomp->im_devvp = devvp; diff --git a/sys/isofs/cd9660/cd9660_vnops.c b/sys/isofs/cd9660/cd9660_vnops.c index 5f05dc9d65f..cd5567a77b4 100644 --- a/sys/isofs/cd9660/cd9660_vnops.c +++ b/sys/isofs/cd9660/cd9660_vnops.c @@ -1,4 +1,4 @@ -/* $OpenBSD: cd9660_vnops.c,v 1.14 2001/06/23 
02:14:23 csapuntz Exp $ */ +/* $OpenBSD: cd9660_vnops.c,v 1.15 2001/11/27 05:27:11 art Exp $ */ /* $NetBSD: cd9660_vnops.c,v 1.42 1997/10/16 23:56:57 christos Exp $ */ /*- @@ -314,9 +314,9 @@ cd9660_read(v) struct ucred *a_cred; } */ *ap = v; struct vnode *vp = ap->a_vp; - register struct uio *uio = ap->a_uio; - register struct iso_node *ip = VTOI(vp); - register struct iso_mnt *imp; + struct uio *uio = ap->a_uio; + struct iso_node *ip = VTOI(vp); + struct iso_mnt *imp; struct buf *bp; daddr_t lbn, rablock; off_t diff; @@ -329,6 +329,26 @@ cd9660_read(v) return (EINVAL); ip->i_flag |= IN_ACCESS; imp = ip->i_mnt; + + if (vp->v_type == VREG) { + error = 0; + while (uio->uio_resid > 0) { + void *win; + vsize_t bytelen = MIN(ip->i_size - uio->uio_offset, + uio->uio_resid); + + if (bytelen == 0) + break; + win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, + &bytelen, UBC_READ); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + if (error) + break; + } + goto out; + } + do { lbn = lblkno(imp, uio->uio_offset); on = blkoff(imp, uio->uio_offset); @@ -370,6 +390,8 @@ cd9660_read(v) bp->b_flags |= B_AGE; brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); + +out: return (error); } @@ -1045,7 +1067,9 @@ struct vnodeopv_entry_desc cd9660_vnodeop_entries[] = { { &vop_pathconf_desc, cd9660_pathconf },/* pathconf */ { &vop_advlock_desc, cd9660_advlock }, /* advlock */ { &vop_bwrite_desc, vop_generic_bwrite }, - { (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL } + { &vop_getpages_desc, genfs_getpages }, + { &vop_size_desc, genfs_size }, + { NULL, NULL } }; struct vnodeopv_desc cd9660_vnodeop_opv_desc = { &cd9660_vnodeop_p, cd9660_vnodeop_entries }; diff --git a/sys/kern/exec_subr.c b/sys/kern/exec_subr.c index 770a29f8adc..e79db64dcae 100644 --- a/sys/kern/exec_subr.c +++ b/sys/kern/exec_subr.c @@ -1,4 +1,4 @@ -/* $OpenBSD: exec_subr.c,v 1.14 2001/11/07 01:18:01 art Exp $ */ +/* $OpenBSD: exec_subr.c,v 1.15 2001/11/27 05:27:11 art Exp $ */ /* $NetBSD: exec_subr.c,v 1.9 1994/12/04 03:10:42 mycroft Exp $ */ /* @@ -167,6 +167,7 @@ vmcmd_map_pagedvn(p, cmd) uobj = uvn_attach((void *) cmd->ev_vp, VM_PROT_READ|VM_PROT_EXECUTE); if (uobj == NULL) return(ENOMEM); + VREF(cmd->ev_vp); /* * do the map diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index c909a23141b..f807a181062 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -1,4 +1,4 @@ -/* $OpenBSD: init_main.c,v 1.84 2001/11/10 18:42:31 art Exp $ */ +/* $OpenBSD: init_main.c,v 1.85 2001/11/27 05:27:11 art Exp $ */ /* $NetBSD: init_main.c,v 1.84.4.1 1996/06/02 09:08:06 mrg Exp $ */ /* @@ -217,6 +217,8 @@ main(framep) cpu_configure(); + ubc_init(); /* Initialize the unified buffer cache */ + /* Initialize sysctls (must be done before any processes run) */ sysctl_init(); diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index 2d12034b386..9f621da43d2 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -1,4 +1,4 @@ -/* $OpenBSD: kern_exec.c,v 1.60 2001/11/12 01:26:09 art Exp $ */ +/* $OpenBSD: kern_exec.c,v 1.61 2001/11/27 05:27:11 art Exp $ */ /* $NetBSD: kern_exec.c,v 1.75 1996/02/09 18:59:28 christos Exp $ */ /*- @@ -150,6 +150,7 @@ check_exec(p, epp) goto bad1; /* now we have the file, get the exec header */ + uvn_attach(vp, VM_PROT_READ); error = vn_rdwr(UIO_READ, vp, epp->ep_hdr, epp->ep_hdrlen, 0, UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); if (error) diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 71674e95236..6f361c989c0 100644 --- a/sys/kern/vfs_bio.c +++ 
b/sys/kern/vfs_bio.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vfs_bio.c,v 1.51 2001/11/15 23:25:37 art Exp $ */ +/* $OpenBSD: vfs_bio.c,v 1.52 2001/11/27 05:27:11 art Exp $ */ /* $NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $ */ /*- @@ -406,7 +406,6 @@ bwrite(bp) /* Initiate disk write. Make sure the appropriate party is charged. */ bp->b_vp->v_numoutput++; splx(s); - SET(bp->b_flags, B_WRITEINPROG); VOP_STRATEGY(bp); if (async) @@ -466,7 +465,6 @@ bdwrite(bp) } /* Otherwise, the "write" is done, so mark and release the buffer. */ - CLR(bp->b_flags, B_NEEDCOMMIT); SET(bp->b_flags, B_DONE); brelse(bp); } @@ -588,6 +586,7 @@ brelse(bp) /* Unlock the buffer. */ CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE | B_DEFERRED)); + SET(bp->b_flags, B_CACHE); /* Allow disk interrupts. */ splx(s); @@ -651,44 +650,30 @@ getblk(vp, blkno, size, slpflag, slptimeo) daddr_t blkno; int size, slpflag, slptimeo; { - struct bufhashhdr *bh; struct buf *bp, *nbp = NULL; int s, err; - /* - * XXX - * The following is an inlined version of 'incore()', but with - * the 'invalid' test moved to after the 'busy' test. It's - * necessary because there are some cases in which the NFS - * code sets B_INVAL prior to writing data to the server, but - * in which the buffers actually contain valid data. In this - * case, we can't allow the system to allocate a new buffer for - * the block until the write is finished. - */ - bh = BUFHASH(vp, blkno); start: - bp = bh->lh_first; - for (; bp != NULL; bp = bp->b_hash.le_next) { - if (bp->b_lblkno != blkno || bp->b_vp != vp) - continue; - + bp = incore(vp, blkno); + if (bp != NULL) { s = splbio(); if (ISSET(bp->b_flags, B_BUSY)) { SET(bp->b_flags, B_WANTED); err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk", slptimeo); splx(s); - if (err) + if (err) { + if (nbp != NULL) { + SET(nbp->b_flags, B_AGE); + brelse(nbp); + } return (NULL); + } goto start; } - if (!ISSET(bp->b_flags, B_INVAL)) { - SET(bp->b_flags, (B_BUSY | B_CACHE)); - bremfree(bp); - splx(s); - break; - } + SET(bp->b_flags, (B_BUSY | B_CACHE)); + bremfree(bp); splx(s); } @@ -697,7 +682,7 @@ start: goto start; } bp = nbp; - binshash(bp, bh); + binshash(bp, BUFHASH(vp, blkno)); bp->b_blkno = bp->b_lblkno = blkno; s = splbio(); bgetvp(vp, bp); @@ -900,8 +885,6 @@ start: bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; - bp->b_dirtyoff = bp->b_dirtyend = 0; - bp->b_validoff = bp->b_validend = 0; bremhash(bp); *bpp = bp; @@ -1022,7 +1005,6 @@ biodone(bp) buf_complete(bp); if (!ISSET(bp->b_flags, B_READ)) { - CLR(bp->b_flags, B_WRITEINPROG); vwakeup(bp->b_vp); } diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c index 8f426b3a3f5..61f6d0217e9 100644 --- a/sys/kern/vfs_default.c +++ b/sys/kern/vfs_default.c @@ -1,10 +1,9 @@ -/* $OpenBSD: vfs_default.c,v 1.7 2001/06/25 03:28:03 csapuntz Exp $ */ - +/* $OpenBSD: vfs_default.c,v 1.8 2001/11/27 05:27:12 art Exp $ */ /* * Portions of this code are: * - * Copyright (c) 1989, 1993 + * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed @@ -49,9 +48,11 @@ #include <sys/vnode.h> #include <sys/namei.h> #include <sys/malloc.h> +#include <sys/pool.h> #include <sys/event.h> #include <miscfs/specfs/specdev.h> +#include <uvm/uvm.h> extern struct simplelock spechash_slock; @@ -310,3 +311,679 @@ lease_check(void *v) { return (0); } + +/* + * generic VM getpages routine. 
+ * Return PG_BUSY pages for the given range, + * reading from backing store if necessary. + */ + +int +genfs_getpages(v) + void *v; +{ + struct vop_getpages_args /* { + struct vnode *a_vp; + voff_t a_offset; + vm_page_t *a_m; + int *a_count; + int a_centeridx; + vm_prot_t a_access_type; + int a_advice; + int a_flags; + } */ *ap = v; + + off_t newsize, diskeof, memeof; + off_t offset, origoffset, startoffset, endoffset, raoffset; + daddr_t lbn, blkno; + int s, i, error, npages, orignpages, npgs, run, ridx, pidx, pcount; + int fs_bshift, fs_bsize, dev_bshift, dev_bsize; + int flags = ap->a_flags; + size_t bytes, iobytes, tailbytes, totalbytes, skipbytes; + vaddr_t kva; + struct buf *bp, *mbp; + struct vnode *vp = ap->a_vp; + struct uvm_object *uobj = &vp->v_uvm.u_obj; + struct vm_page *pgs[16]; /* XXXUBC 16 */ + struct ucred *cred = curproc->p_ucred; /* XXXUBC curproc */ + boolean_t async = (flags & PGO_SYNCIO) == 0; + boolean_t write = (ap->a_access_type & VM_PROT_WRITE) != 0; + boolean_t sawhole = FALSE; + struct proc *p = curproc; + UVMHIST_FUNC("genfs_getpages"); UVMHIST_CALLED(ubchist); + + UVMHIST_LOG(ubchist, "vp %p off 0x%x/%x count %d", + vp, ap->a_offset >> 32, ap->a_offset, *ap->a_count); + + /* XXXUBC temp limit */ + if (*ap->a_count > 16) { + return EINVAL; + } + + error = 0; + origoffset = ap->a_offset; + orignpages = *ap->a_count; + error = VOP_SIZE(vp, vp->v_uvm.u_size, &diskeof); + if (error) { + return error; + } + if (flags & PGO_PASTEOF) { + newsize = MAX(vp->v_uvm.u_size, + origoffset + (orignpages << PAGE_SHIFT)); + error = VOP_SIZE(vp, newsize, &memeof); + if (error) { + return error; + } + } else { + memeof = diskeof; + } + KASSERT(ap->a_centeridx >= 0 || ap->a_centeridx <= orignpages); + KASSERT((origoffset & (PAGE_SIZE - 1)) == 0 && origoffset >= 0); + KASSERT(orignpages > 0); + + /* + * Bounds-check the request. + */ + + if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= memeof) { + if ((flags & PGO_LOCKED) == 0) { + simple_unlock(&uobj->vmobjlock); + } + UVMHIST_LOG(ubchist, "off 0x%x count %d goes past EOF 0x%x", + origoffset, *ap->a_count, memeof,0); + return EINVAL; + } + + /* + * For PGO_LOCKED requests, just return whatever's in memory. + */ + + if (flags & PGO_LOCKED) { + uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m, + UFP_NOWAIT|UFP_NOALLOC|UFP_NORDONLY); + + return ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0; + } + + /* vnode is VOP_LOCKed, uobj is locked */ + + if (write && (vp->v_bioflag & VBIOONSYNCLIST) == 0) { + vn_syncer_add_to_worklist(vp, syncdelay); + } + + /* + * find the requested pages and make some simple checks. + * leave space in the page array for a whole block. + */ + + fs_bshift = vp->v_mount->mnt_fs_bshift; + fs_bsize = 1 << fs_bshift; + dev_bshift = vp->v_mount->mnt_dev_bshift; + dev_bsize = 1 << dev_bshift; + KASSERT((diskeof & (dev_bsize - 1)) == 0); + KASSERT((memeof & (dev_bsize - 1)) == 0); + + orignpages = MIN(orignpages, + round_page(memeof - origoffset) >> PAGE_SHIFT); + npages = orignpages; + startoffset = origoffset & ~(fs_bsize - 1); + endoffset = round_page((origoffset + (npages << PAGE_SHIFT) + + fs_bsize - 1) & ~(fs_bsize - 1)); + endoffset = MIN(endoffset, round_page(memeof)); + ridx = (origoffset - startoffset) >> PAGE_SHIFT; + + memset(pgs, 0, sizeof(pgs)); + uvn_findpages(uobj, origoffset, &npages, &pgs[ridx], UFP_ALL); + + /* + * if PGO_OVERWRITE is set, don't bother reading the pages. + * PGO_OVERWRITE also means that the caller guarantees + * that the pages already have backing store allocated. 
+ */ + + if (flags & PGO_OVERWRITE) { + UVMHIST_LOG(ubchist, "PGO_OVERWRITE",0,0,0,0); + + for (i = 0; i < npages; i++) { + struct vm_page *pg = pgs[ridx + i]; + + if (pg->flags & PG_FAKE) { + uvm_pagezero(pg); + pg->flags &= ~(PG_FAKE); + } + pg->flags &= ~(PG_RDONLY); + } + npages += ridx; + goto out; + } + + /* + * if the pages are already resident, just return them. + */ + + for (i = 0; i < npages; i++) { + struct vm_page *pg = pgs[ridx + i]; + + if ((pg->flags & PG_FAKE) || + (write && (pg->flags & PG_RDONLY))) { + break; + } + } + if (i == npages) { + UVMHIST_LOG(ubchist, "returning cached pages", 0,0,0,0); + raoffset = origoffset + (orignpages << PAGE_SHIFT); + npages += ridx; + goto raout; + } + + /* + * the page wasn't resident and we're not overwriting, + * so we're going to have to do some i/o. + * find any additional pages needed to cover the expanded range. + */ + + if (startoffset != origoffset) { + + /* + * XXXUBC we need to avoid deadlocks caused by locking + * additional pages at lower offsets than pages we + * already have locked. for now, unlock them all and + * start over. + */ + + for (i = 0; i < npages; i++) { + struct vm_page *pg = pgs[ridx + i]; + + if (pg->flags & PG_FAKE) { + pg->flags |= PG_RELEASED; + } + } + uvm_page_unbusy(&pgs[ridx], npages); + memset(pgs, 0, sizeof(pgs)); + + UVMHIST_LOG(ubchist, "reset npages start 0x%x end 0x%x", + startoffset, endoffset, 0,0); + npages = (endoffset - startoffset) >> PAGE_SHIFT; + npgs = npages; + uvn_findpages(uobj, startoffset, &npgs, pgs, UFP_ALL); + } + simple_unlock(&uobj->vmobjlock); + + /* + * read the desired page(s). + */ + + totalbytes = npages << PAGE_SHIFT; + bytes = MIN(totalbytes, MAX(diskeof - startoffset, 0)); + tailbytes = totalbytes - bytes; + skipbytes = 0; + + kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WAITOK | + UVMPAGER_MAPIN_READ); + + s = splbio(); + mbp = pool_get(&bufpool, PR_WAITOK); + splx(s); + mbp->b_bufsize = totalbytes; + mbp->b_data = (void *)kva; + mbp->b_resid = mbp->b_bcount = bytes; + mbp->b_flags = B_BUSY|B_READ| (async ? B_CALL : 0); + mbp->b_iodone = uvm_aio_biodone; + mbp->b_vp = vp; + LIST_INIT(&mbp->b_dep); + + /* + * if EOF is in the middle of the range, zero the part past EOF. + */ + + if (tailbytes > 0) { + memset((void *)(kva + bytes), 0, tailbytes); + } + + /* + * now loop over the pages, reading as needed. + */ + + if (write) { + lockmgr(&vp->v_glock, LK_EXCLUSIVE, NULL, p); + } else { + lockmgr(&vp->v_glock, LK_SHARED, NULL, p); + } + + bp = NULL; + for (offset = startoffset; + bytes > 0; + offset += iobytes, bytes -= iobytes) { + + /* + * skip pages which don't need to be read. + */ + + pidx = (offset - startoffset) >> PAGE_SHIFT; + while ((pgs[pidx]->flags & PG_FAKE) == 0) { + size_t b; + + KASSERT((offset & (PAGE_SIZE - 1)) == 0); + b = MIN(PAGE_SIZE, bytes); + offset += b; + bytes -= b; + skipbytes += b; + pidx++; + UVMHIST_LOG(ubchist, "skipping, new offset 0x%x", + offset, 0,0,0); + if (bytes == 0) { + goto loopdone; + } + } + + /* + * bmap the file to find out the blkno to read from and + * how much we can read in one i/o. if bmap returns an error, + * skip the rest of the top-level i/o. + */ + + lbn = offset >> fs_bshift; + error = VOP_BMAP(vp, lbn, NULL, &blkno, &run); + if (error) { + UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%x -> %d\n", + lbn, error,0,0); + skipbytes += bytes; + goto loopdone; + } + + /* + * see how many pages can be read with this i/o. + * reduce the i/o size if necessary to avoid + * overwriting pages with valid data. 
+ */ + + iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset, + bytes); + if (offset + iobytes > round_page(offset)) { + pcount = 1; + while (pidx + pcount < npages && + pgs[pidx + pcount]->flags & PG_FAKE) { + pcount++; + } + iobytes = MIN(iobytes, (pcount << PAGE_SHIFT) - + (offset - trunc_page(offset))); + } + + /* + * if this block isn't allocated, zero it instead of reading it. + * if this is a read access, mark the pages we zeroed PG_RDONLY. + */ + + if (blkno < 0) { + UVMHIST_LOG(ubchist, "lbn 0x%x -> HOLE", lbn,0,0,0); + + sawhole = TRUE; + memset((char *)kva + (offset - startoffset), 0, + iobytes); + skipbytes += iobytes; + + if (!write) { + int holepages = + (round_page(offset + iobytes) - + trunc_page(offset)) >> PAGE_SHIFT; + for (i = 0; i < holepages; i++) { + pgs[pidx + i]->flags |= PG_RDONLY; + } + } + continue; + } + + /* + * allocate a sub-buf for this piece of the i/o + * (or just use mbp if there's only 1 piece), + * and start it going. + */ + + if (offset == startoffset && iobytes == bytes) { + bp = mbp; + } else { + s = splbio(); + bp = pool_get(&bufpool, PR_WAITOK); + splx(s); + bp->b_data = (char *)kva + offset - startoffset; + bp->b_resid = bp->b_bcount = iobytes; + bp->b_flags = B_BUSY|B_READ|B_CALL; + bp->b_iodone = uvm_aio_biodone1; + bp->b_vp = vp; + LIST_INIT(&bp->b_dep); + } + bp->b_lblkno = 0; + bp->b_private = mbp; + + /* adjust physical blkno for partial blocks */ + bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >> + dev_bshift); + + UVMHIST_LOG(ubchist, "bp %p offset 0x%x bcount 0x%x blkno 0x%x", + bp, offset, iobytes, bp->b_blkno); + + VOP_STRATEGY(bp); + } + +loopdone: + if (skipbytes) { + s = splbio(); + if (error) { + mbp->b_flags |= B_ERROR; + mbp->b_error = error; + } + mbp->b_resid -= skipbytes; + if (mbp->b_resid == 0) { + biodone(mbp); + } + splx(s); + } + + if (async) { + UVMHIST_LOG(ubchist, "returning PEND",0,0,0,0); + lockmgr(&vp->v_glock, LK_RELEASE, NULL, p); + return EINPROGRESS; + } + if (bp != NULL) { + error = biowait(mbp); + } + s = splbio(); + pool_put(&bufpool, mbp); + splx(s); + uvm_pagermapout(kva, npages); + raoffset = startoffset + totalbytes; + + /* + * if this we encountered a hole then we have to do a little more work. + * for read faults, we marked the page PG_RDONLY so that future + * write accesses to the page will fault again. + * for write faults, we must make sure that the backing store for + * the page is completely allocated while the pages are locked. + */ + + if (error == 0 && sawhole && write) { + error = VOP_BALLOCN(vp, startoffset, npages << PAGE_SHIFT, + cred, 0); + if (error) { + UVMHIST_LOG(ubchist, "balloc lbn 0x%x -> %d", + lbn, error,0,0); + lockmgr(&vp->v_glock, LK_RELEASE, NULL, p); + simple_lock(&uobj->vmobjlock); + goto out; + } + } + lockmgr(&vp->v_glock, LK_RELEASE, NULL, p); + simple_lock(&uobj->vmobjlock); + + /* + * see if we want to start any readahead. + * XXXUBC for now, just read the next 128k on 64k boundaries. + * this is pretty nonsensical, but it is 50% faster than reading + * just the next 64k. + */ + +raout: + if (!error && !async && !write && ((int)raoffset & 0xffff) == 0 && + PAGE_SHIFT <= 16) { + int racount; + + racount = 1 << (16 - PAGE_SHIFT); + (void) VOP_GETPAGES(vp, raoffset, NULL, &racount, 0, + VM_PROT_READ, 0, 0); + simple_lock(&uobj->vmobjlock); + + racount = 1 << (16 - PAGE_SHIFT); + (void) VOP_GETPAGES(vp, raoffset + 0x10000, NULL, &racount, 0, + VM_PROT_READ, 0, 0); + simple_lock(&uobj->vmobjlock); + } + + /* + * we're almost done! release the pages... 
+ * for errors, we free the pages. + * otherwise we activate them and mark them as valid and clean. + * also, unbusy pages that were not actually requested. + */ + +out: + if (error) { + uvm_lock_pageq(); + for (i = 0; i < npages; i++) { + if (pgs[i] == NULL) { + continue; + } + UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x", + pgs[i], pgs[i]->flags, 0,0); + if (pgs[i]->flags & PG_WANTED) { + wakeup(pgs[i]); + } + if (pgs[i]->flags & PG_RELEASED) { + uvm_unlock_pageq(); + (uobj->pgops->pgo_releasepg)(pgs[i], NULL); + uvm_lock_pageq(); + continue; + } + if (pgs[i]->flags & PG_FAKE) { + uvm_pagefree(pgs[i]); + continue; + } + uvm_pageactivate(pgs[i]); + pgs[i]->flags &= ~(PG_WANTED|PG_BUSY); + UVM_PAGE_OWN(pgs[i], NULL); + } + uvm_unlock_pageq(); + simple_unlock(&uobj->vmobjlock); + UVMHIST_LOG(ubchist, "returning error %d", error,0,0,0); + return error; + } + + UVMHIST_LOG(ubchist, "succeeding, npages %d", npages,0,0,0); + uvm_lock_pageq(); + for (i = 0; i < npages; i++) { + if (pgs[i] == NULL) { + continue; + } + UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x", + pgs[i], pgs[i]->flags, 0,0); + if (pgs[i]->flags & PG_FAKE) { + UVMHIST_LOG(ubchist, "unfaking pg %p offset 0x%x", + pgs[i], pgs[i]->offset,0,0); + pgs[i]->flags &= ~(PG_FAKE); + pmap_clear_modify(pgs[i]); + pmap_clear_reference(pgs[i]); + } + if (write) { + pgs[i]->flags &= ~(PG_RDONLY); + } + if (i < ridx || i >= ridx + orignpages || async) { + UVMHIST_LOG(ubchist, "unbusy pg %p offset 0x%x", + pgs[i], pgs[i]->offset,0,0); + if (pgs[i]->flags & PG_WANTED) { + wakeup(pgs[i]); + } + if (pgs[i]->flags & PG_RELEASED) { + uvm_unlock_pageq(); + (uobj->pgops->pgo_releasepg)(pgs[i], NULL); + uvm_lock_pageq(); + continue; + } + uvm_pageactivate(pgs[i]); + pgs[i]->flags &= ~(PG_WANTED|PG_BUSY); + UVM_PAGE_OWN(pgs[i], NULL); + } + } + uvm_unlock_pageq(); + simple_unlock(&uobj->vmobjlock); + if (ap->a_m != NULL) { + memcpy(ap->a_m, &pgs[ridx], + orignpages * sizeof(struct vm_page *)); + } + return 0; +} + +/* + * generic VM putpages routine. + * Write the given range of pages to backing store. 
+ */ + +int +genfs_putpages(v) + void *v; +{ + struct vop_putpages_args /* { + struct vnode *a_vp; + struct vm_page **a_m; + int a_count; + int a_flags; + int *a_rtvals; + } */ *ap = v; + + int s, error, npages, run; + int fs_bshift, dev_bshift, dev_bsize; + vaddr_t kva; + off_t eof, offset, startoffset; + size_t bytes, iobytes, skipbytes; + daddr_t lbn, blkno; + struct vm_page *pg; + struct buf *mbp, *bp; + struct vnode *vp = ap->a_vp; + boolean_t async = (ap->a_flags & PGO_SYNCIO) == 0; + UVMHIST_FUNC("genfs_putpages"); UVMHIST_CALLED(ubchist); + UVMHIST_LOG(ubchist, "vp %p offset 0x%x count %d", + vp, ap->a_m[0]->offset, ap->a_count, 0); + + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); + + error = VOP_SIZE(vp, vp->v_uvm.u_size, &eof); + if (error) { + return error; + } + + error = 0; + npages = ap->a_count; + fs_bshift = vp->v_mount->mnt_fs_bshift; + dev_bshift = vp->v_mount->mnt_dev_bshift; + dev_bsize = 1 << dev_bshift; + KASSERT((eof & (dev_bsize - 1)) == 0); + + pg = ap->a_m[0]; + startoffset = pg->offset; + bytes = MIN(npages << PAGE_SHIFT, eof - startoffset); + skipbytes = 0; + KASSERT(bytes != 0); + + kva = uvm_pagermapin(ap->a_m, npages, UVMPAGER_MAPIN_WAITOK); + + s = splbio(); + vp->v_numoutput += 2; + mbp = pool_get(&bufpool, PR_WAITOK); + UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x", + vp, mbp, vp->v_numoutput, bytes); + splx(s); + mbp->b_bufsize = npages << PAGE_SHIFT; + mbp->b_data = (void *)kva; + mbp->b_resid = mbp->b_bcount = bytes; + mbp->b_flags = B_BUSY|B_WRITE|B_AGE | + (async ? B_CALL : 0) | + (curproc == uvm.pagedaemon_proc ? B_PDAEMON : 0); + mbp->b_iodone = uvm_aio_biodone; + mbp->b_vp = vp; + LIST_INIT(&mbp->b_dep); + + bp = NULL; + for (offset = startoffset; + bytes > 0; + offset += iobytes, bytes -= iobytes) { + lbn = offset >> fs_bshift; + error = VOP_BMAP(vp, lbn, NULL, &blkno, &run); + if (error) { + UVMHIST_LOG(ubchist, "VOP_BMAP() -> %d", error,0,0,0); + skipbytes += bytes; + bytes = 0; + break; + } + + iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset, + bytes); + if (blkno == (daddr_t)-1) { + skipbytes += iobytes; + continue; + } + + /* if it's really one i/o, don't make a second buf */ + if (offset == startoffset && iobytes == bytes) { + bp = mbp; + } else { + s = splbio(); + vp->v_numoutput++; + bp = pool_get(&bufpool, PR_WAITOK); + UVMHIST_LOG(ubchist, "vp %p bp %p num now %d", + vp, bp, vp->v_numoutput, 0); + splx(s); + bp->b_data = (char *)kva + + (vaddr_t)(offset - pg->offset); + bp->b_resid = bp->b_bcount = iobytes; + bp->b_flags = B_BUSY|B_WRITE|B_CALL|B_ASYNC; + bp->b_iodone = uvm_aio_biodone1; + bp->b_vp = vp; + LIST_INIT(&bp->b_dep); + } + bp->b_lblkno = 0; + bp->b_private = mbp; + + /* adjust physical blkno for partial blocks */ + bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >> + dev_bshift); + UVMHIST_LOG(ubchist, "vp %p offset 0x%x bcount 0x%x blkno 0x%x", + vp, offset, bp->b_bcount, bp->b_blkno); + VOP_STRATEGY(bp); + } + if (skipbytes) { + UVMHIST_LOG(ubchist, "skipbytes %d", skipbytes, 0,0,0); + s = splbio(); + mbp->b_resid -= skipbytes; + if (error) { + mbp->b_flags |= B_ERROR; + mbp->b_error = error; + } + if (mbp->b_resid == 0) { + biodone(mbp); + } + splx(s); + } + if (async) { + UVMHIST_LOG(ubchist, "returning PEND", 0,0,0,0); + return EINPROGRESS; + } + if (bp != NULL) { + UVMHIST_LOG(ubchist, "waiting for mbp %p", mbp,0,0,0); + error = biowait(mbp); + } + if (bioops.io_pageiodone) { + (*bioops.io_pageiodone)(mbp); + } + s = splbio(); + if (mbp->b_vp) + vwakeup(mbp->b_vp); + pool_put(&bufpool, 
mbp); + splx(s); + uvm_pagermapout(kva, npages); + UVMHIST_LOG(ubchist, "returning, error %d", error,0,0,0); + return error; +} + +int +genfs_size(v) + void *v; +{ + struct vop_size_args /* { + struct vnode *a_vp; + off_t a_size; + off_t *a_eobp; + } */ *ap = v; + int bsize; + + bsize = 1 << ap->a_vp->v_mount->mnt_fs_bshift; + *ap->a_eobp = (ap->a_size + bsize - 1) & ~(bsize - 1); + return 0; +} diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index e4efaff930f..7f668a7edde 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vfs_subr.c,v 1.72 2001/11/21 21:13:34 csapuntz Exp $ */ +/* $OpenBSD: vfs_subr.c,v 1.73 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: vfs_subr.c,v 1.53 1996/04/22 01:39:13 christos Exp $ */ /* @@ -377,6 +377,8 @@ getnewvnode(tag, mp, vops, vpp) int (**vops) __P((void *)); struct vnode **vpp; { + extern struct uvm_pagerops uvm_vnodeops; + struct uvm_object *uobj; struct proc *p = curproc; /* XXX */ struct freelst *listhd; static int toggle; @@ -410,7 +412,7 @@ getnewvnode(tag, mp, vops, vpp) splx(s); simple_unlock(&vnode_free_list_slock); vp = pool_get(&vnode_pool, PR_WAITOK); - bzero((char *)vp, sizeof *vp); + bzero(vp, sizeof *vp); numvnodes++; } else { for (vp = TAILQ_FIRST(listhd); vp != NULLVP; @@ -423,7 +425,7 @@ getnewvnode(tag, mp, vops, vpp) * the first NCPUS items on the free list are * locked, so this is close enough to being empty. */ - if (vp == NULLVP) { + if (vp == NULL) { splx(s); simple_unlock(&vnode_free_list_slock); tablefull("vnode"); @@ -458,6 +460,7 @@ getnewvnode(tag, mp, vops, vpp) vp->v_socket = 0; } vp->v_type = VNON; + lockinit(&vp->v_glock, PVFS, "glock", 0, 0); cache_purge(vp); vp->v_tag = tag; vp->v_op = vops; @@ -466,6 +469,16 @@ getnewvnode(tag, mp, vops, vpp) vp->v_usecount = 1; vp->v_data = 0; simple_lock_init(&vp->v_uvm.u_obj.vmobjlock); + + /* + * initialize uvm_object within vnode. + */ + + uobj = &vp->v_uvm.u_obj; + uobj->pgops = &uvm_vnodeops; + TAILQ_INIT(&uobj->memq); + vp->v_uvm.u_size = VSIZENOTSET; + return (0); } @@ -669,6 +682,10 @@ vget(vp, flags, p) flags |= LK_INTERLOCK; } if (vp->v_flag & VXLOCK) { + if (flags & LK_NOWAIT) { + simple_unlock(&vp->v_interlock); + return (EBUSY); + } vp->v_flag |= VXWANT; simple_unlock(&vp->v_interlock); tsleep((caddr_t)vp, PINOD, "vget", 0); @@ -787,6 +804,11 @@ vput(vp) #endif vputonfreelist(vp); + if (vp->v_flag & VTEXT) { + uvmexp.vtextpages -= vp->v_uvm.u_obj.uo_npages; + uvmexp.vnodepages += vp->v_uvm.u_obj.uo_npages; + } + vp->v_flag &= ~VTEXT; simple_unlock(&vp->v_interlock); VOP_INACTIVE(vp, p); @@ -827,6 +849,11 @@ vrele(vp) #endif vputonfreelist(vp); + if (vp->v_flag & VTEXT) { + uvmexp.vtextpages -= vp->v_uvm.u_obj.uo_npages; + uvmexp.vnodepages += vp->v_uvm.u_obj.uo_npages; + } + vp->v_flag &= ~VTEXT; if (vn_lock(vp, LK_EXCLUSIVE|LK_INTERLOCK, p) == 0) VOP_INACTIVE(vp, p); } @@ -1009,6 +1036,12 @@ vclean(vp, flags, p) if (vp->v_flag & VXLOCK) panic("vclean: deadlock"); vp->v_flag |= VXLOCK; + if (vp->v_flag & VTEXT) { + uvmexp.vtextpages -= vp->v_uvm.u_obj.uo_npages; + uvmexp.vnodepages += vp->v_uvm.u_obj.uo_npages; + } + vp->v_flag &= ~VTEXT; + /* * Even if the count is zero, the VOP_INACTIVE routine may still * have the object locked while it cleans it out. The VOP_LOCK @@ -1019,11 +1052,7 @@ vclean(vp, flags, p) VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p); /* - * clean out any VM data associated with the vnode. - */ - uvm_vnp_terminate(vp); - /* - * Clean out any buffers associated with the vnode. 
+ * Clean out any cached data associated with the vnode. */ if (flags & DOCLOSE) vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0); @@ -1968,9 +1997,22 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) struct proc *p; int slpflag, slptimeo; { - register struct buf *bp; + struct uvm_object *uobj = &vp->v_uvm.u_obj; + struct buf *bp; struct buf *nbp, *blist; - int s, error; + int s, error, rv; + int flushflags = PGO_ALLPAGES|PGO_FREE|PGO_SYNCIO| + (flags & V_SAVE ? PGO_CLEANIT : 0); + + /* XXXUBC this doesn't look at flags or slp* */ + if (vp->v_type == VREG) { + simple_lock(&uobj->vmobjlock); + rv = (uobj->pgops->pgo_flush)(uobj, 0, 0, flushflags); + simple_unlock(&uobj->vmobjlock); + if (!rv) { + return EIO; + } + } if (flags & V_SAVE) { s = splbio(); @@ -2040,12 +2082,21 @@ loop: void vflushbuf(vp, sync) - register struct vnode *vp; + struct vnode *vp; int sync; { - register struct buf *bp, *nbp; + struct uvm_object *uobj = &vp->v_uvm.u_obj; + struct buf *bp, *nbp; int s; + if (vp->v_type == VREG) { + int flags = PGO_CLEANIT|PGO_ALLPAGES| (sync ? PGO_SYNCIO : 0); + + simple_lock(&uobj->vmobjlock); + (uobj->pgops->pgo_flush)(uobj, 0, 0, flags); + simple_unlock(&uobj->vmobjlock); + } + loop: s = splbio(); for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { @@ -2112,23 +2163,25 @@ bgetvp(vp, bp) */ void brelvp(bp) - register struct buf *bp; + struct buf *bp; { struct vnode *vp; - if ((vp = bp->b_vp) == (struct vnode *) 0) + if ((vp = bp->b_vp) == NULL) panic("brelvp: NULL"); + /* * Delete from old vnode list, if on one. */ if (bp->b_vnbufs.le_next != NOLIST) bufremvn(bp); - if ((vp->v_bioflag & VBIOONSYNCLIST) && + if (TAILQ_EMPTY(&vp->v_uvm.u_obj.memq) && + (vp->v_bioflag & VBIOONSYNCLIST) && LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { vp->v_bioflag &= ~VBIOONSYNCLIST; LIST_REMOVE(vp, v_synclist); } - bp->b_vp = (struct vnode *) 0; + bp->b_vp = NULL; simple_lock(&vp->v_interlock); #ifdef DIAGNOSTIC @@ -2205,7 +2258,8 @@ reassignbuf(bp) */ if ((bp->b_flags & B_DELWRI) == 0) { listheadp = &vp->v_cleanblkhd; - if ((vp->v_bioflag & VBIOONSYNCLIST) && + if (TAILQ_EMPTY(&vp->v_uvm.u_obj.memq) && + (vp->v_bioflag & VBIOONSYNCLIST) && LIST_FIRST(&vp->v_dirtyblkhd) == NULL) { vp->v_bioflag &= ~VBIOONSYNCLIST; LIST_REMOVE(vp, v_synclist); diff --git a/sys/kern/vfs_sync.c b/sys/kern/vfs_sync.c index 4b07d0f373a..0adeb2f3065 100644 --- a/sys/kern/vfs_sync.c +++ b/sys/kern/vfs_sync.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vfs_sync.c,v 1.20 2001/11/15 06:38:48 art Exp $ */ +/* $OpenBSD: vfs_sync.c,v 1.21 2001/11/27 05:27:12 art Exp $ */ /* * Portions of this code are: @@ -176,15 +176,12 @@ sched_sync(p) VOP_UNLOCK(vp, 0, p); s = splbio(); if (LIST_FIRST(slp) == vp) { - /* - * Note: disk vps can remain on the - * worklist too with no dirty blocks, but - * since sync_fsync() moves it to a different - * slot we are safe. - */ - if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL && - vp->v_type != VBLK) - panic("sched_sync: fsync failed"); +#ifdef DIAGNOSTIC + if (!(vp->v_bioflag & VBIOONSYNCLIST)) { + vprint("vnode", vp); + panic("sched_fsync: on synclist, but no flag"); + } +#endif /* * Put us back on the worklist. 
The worklist * routine will remove us from our current diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 284fad0fbda..5433711decd 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vfs_syscalls.c,v 1.82 2001/11/06 19:53:20 miod Exp $ */ +/* $OpenBSD: vfs_syscalls.c,v 1.83 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: vfs_syscalls.c,v 1.71 1996/04/23 10:29:02 mycroft Exp $ */ /* @@ -493,7 +493,6 @@ sys_sync(p, v, retval) if ((mp->mnt_flag & MNT_RDONLY) == 0) { asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; - uvm_vnp_sync(mp); VFS_SYNC(mp, MNT_NOWAIT, p->p_ucred, p); if (asyncflag) mp->mnt_flag |= MNT_ASYNC; @@ -1064,6 +1063,13 @@ sys_fhopen(p, v, retval) } if ((error = VOP_OPEN(vp, flags, cred, p)) != 0) goto bad; + + if (vp->v_type == VREG && + uvn_attach(vp, flags & FWRITE ? VM_PROT_WRITE : 0) == NULL) { + error = EIO; + goto bad; + } + if (flags & FWRITE) vp->v_writecount++; @@ -1475,8 +1481,6 @@ sys_unlink(p, v, retval) goto out; } - (void)uvm_vnp_uncache(vp); - VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); @@ -2338,7 +2342,6 @@ out: if (fromnd.ni_dvp != tdvp) VOP_LEASE(fromnd.ni_dvp, p, p->p_ucred, LEASE_WRITE); if (tvp) { - (void)uvm_vnp_uncache(tvp); VOP_LEASE(tvp, p, p->p_ucred, LEASE_WRITE); } error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index ee5eb0baee2..491db1172fa 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -1,4 +1,4 @@ -/* $OpenBSD: vfs_vnops.c,v 1.35 2001/11/15 06:22:30 art Exp $ */ +/* $OpenBSD: vfs_vnops.c,v 1.36 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: vfs_vnops.c,v 1.20 1996/02/04 02:18:41 christos Exp $ */ /* @@ -165,6 +165,11 @@ vn_open(ndp, fmode, cmode) } if ((error = VOP_OPEN(vp, fmode, cred, p)) != 0) goto bad; + if (vp->v_type == VREG && + uvn_attach(vp, fmode & FWRITE ? VM_PROT_WRITE : 0) == NULL) { + error = EIO; + goto bad; + } if (fmode & FWRITE) vp->v_writecount++; return (0); @@ -197,11 +202,10 @@ vn_writechk(vp) } } /* - * If there's shared text associated with - * the vnode, try to free it up once. If - * we fail, we can't allow writing. + * If the vnode is in use as a process's text, + * we can't allow writing. */ - if ((vp->v_flag & VTEXT) && !uvm_vnp_uncache(vp)) + if (vp->v_flag & VTEXT) return (ETXTBSY); return (0); @@ -214,6 +218,23 @@ void vn_marktext(vp) struct vnode *vp; { + if ((vp->v_flag & VTEXT) == 0) { + uvmexp.vnodepages -= vp->v_uvm.u_obj.uo_npages; + uvmexp.vtextpages += vp->v_uvm.u_obj.uo_npages; +#if 0 + /* + * Doesn't help much because the pager is borked and ubc_flush is + * slow. + */ +#ifdef PMAP_PREFER + /* + * Get rid of any cached reads from this vnode. + * exec can't respect PMAP_PREFER when mapping the text. + */ + ubc_flush(&vp->v_uvm.u_obj, 0, 0); +#endif +#endif + } vp->v_flag |= VTEXT; } diff --git a/sys/kern/vnode_if.c b/sys/kern/vnode_if.c index 1f30d85c507..d2a3d8298bf 100644 --- a/sys/kern/vnode_if.c +++ b/sys/kern/vnode_if.c @@ -3,7 +3,7 @@ * (Modifications made here may easily be lost!) 
* * Created from the file: - * OpenBSD: vnode_if.src,v 1.11 2001/06/23 02:21:05 csapuntz Exp + * OpenBSD: vnode_if.src,v 1.13 2001/07/26 20:24:47 millert Exp * by the script: * OpenBSD: vnode_if.sh,v 1.8 2001/02/26 17:34:18 art Exp */ @@ -1230,6 +1230,140 @@ int VOP_WHITEOUT(dvp, cnp, flags) return (VCALL(dvp, VOFFSET(vop_whiteout), &a)); } +int vop_ballocn_vp_offsets[] = { + VOPARG_OFFSETOF(struct vop_ballocn_args,a_vp), + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_ballocn_desc = { + 0, + "vop_ballocn", + 0, + vop_ballocn_vp_offsets, + VDESC_NO_OFFSET, + VOPARG_OFFSETOF(struct vop_ballocn_args, a_cred), + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; + +int VOP_BALLOCN(vp, offset, length, cred, flags) + struct vnode *vp; + off_t offset; + off_t length; + struct ucred *cred; + int flags; +{ + struct vop_ballocn_args a; + a.a_desc = VDESC(vop_ballocn); + a.a_vp = vp; + a.a_offset = offset; + a.a_length = length; + a.a_cred = cred; + a.a_flags = flags; + return (VCALL(vp, VOFFSET(vop_ballocn), &a)); +} + +int vop_getpages_vp_offsets[] = { + VOPARG_OFFSETOF(struct vop_getpages_args,a_vp), + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_getpages_desc = { + 0, + "vop_getpages", + 0, + vop_getpages_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; + +int VOP_GETPAGES(vp, offset, m, count, centeridx, access_type, advice, flags) + struct vnode *vp; + voff_t offset; + vm_page_t *m; + int *count; + int centeridx; + vm_prot_t access_type; + int advice; + int flags; +{ + struct vop_getpages_args a; + a.a_desc = VDESC(vop_getpages); + a.a_vp = vp; + a.a_offset = offset; + a.a_m = m; + a.a_count = count; + a.a_centeridx = centeridx; + a.a_access_type = access_type; + a.a_advice = advice; + a.a_flags = flags; + return (VCALL(vp, VOFFSET(vop_getpages), &a)); +} + +int vop_putpages_vp_offsets[] = { + VOPARG_OFFSETOF(struct vop_putpages_args,a_vp), + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_putpages_desc = { + 0, + "vop_putpages", + 0, + vop_putpages_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; + +int VOP_PUTPAGES(vp, m, count, flags, rtvals) + struct vnode *vp; + vm_page_t *m; + int count; + int flags; + int *rtvals; +{ + struct vop_putpages_args a; + a.a_desc = VDESC(vop_putpages); + a.a_vp = vp; + a.a_m = m; + a.a_count = count; + a.a_flags = flags; + a.a_rtvals = rtvals; + return (VCALL(vp, VOFFSET(vop_putpages), &a)); +} + +int vop_size_vp_offsets[] = { + VOPARG_OFFSETOF(struct vop_size_args,a_vp), + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_size_desc = { + 0, + "vop_size", + 0, + vop_size_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; + +int VOP_SIZE(vp, size, eobp) + struct vnode *vp; + off_t size; + off_t *eobp; +{ + struct vop_size_args a; + a.a_desc = VDESC(vop_size); + a.a_vp = vp; + a.a_size = size; + a.a_eobp = eobp; + return (VCALL(vp, VOFFSET(vop_size), &a)); +} + /* Special cases: */ int vop_strategy_vp_offsets[] = { @@ -1323,6 +1457,10 @@ struct vnodeop_desc *vfs_op_descs[] = { &vop_advlock_desc, &vop_reallocblks_desc, &vop_whiteout_desc, + &vop_ballocn_desc, + &vop_getpages_desc, + &vop_putpages_desc, + &vop_size_desc, NULL }; diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src index fdf8e6e4015..1af0f56e276 100644 --- a/sys/kern/vnode_if.src +++ b/sys/kern/vnode_if.src @@ -1,4 +1,4 @@ -# $OpenBSD: vnode_if.src,v 1.13 2001/07/26 20:24:47 millert Exp $ +# $OpenBSD: vnode_if.src,v 1.14 2001/11/27 05:27:12 art Exp $ # 
$NetBSD: vnode_if.src,v 1.10 1996/05/11 18:26:27 mycroft Exp $ # # Copyright (c) 1992, 1993 @@ -467,3 +467,48 @@ vop_whiteout { #vop_bwrite { # IN struct buf *bp; #}; + +# +#% ballocn vp L L L +# +vop_ballocn { + IN struct vnode *vp; + IN off_t offset; + IN off_t length; + IN struct ucred *cred; + IN int flags; +}; + +# +#% getpages vp L L L +# +vop_getpages { + IN struct vnode *vp; + IN voff_t offset; + IN vm_page_t *m; + IN int *count; + IN int centeridx; + IN vm_prot_t access_type; + IN int advice; + IN int flags; +}; + +# +#% putpages vp L L L +# +vop_putpages { + IN struct vnode *vp; + IN vm_page_t *m; + IN int count; + IN int flags; + IN int *rtvals; +}; + +# +#% size vp = = = +# +vop_size { + IN struct vnode *vp; + IN off_t size; + OUT off_t *eobp; +}; diff --git a/sys/msdosfs/msdosfs_denode.c b/sys/msdosfs/msdosfs_denode.c index eb82f75afe3..f4ab33d5272 100644 --- a/sys/msdosfs/msdosfs_denode.c +++ b/sys/msdosfs/msdosfs_denode.c @@ -1,4 +1,4 @@ -/* $OpenBSD: msdosfs_denode.c,v 1.19 2001/11/06 19:53:20 miod Exp $ */ +/* $OpenBSD: msdosfs_denode.c,v 1.20 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: msdosfs_denode.c,v 1.23 1997/10/17 11:23:58 ws Exp $ */ /*- @@ -72,6 +72,8 @@ u_long dehash; /* size of hash table - 1 */ #define DEHASH(dev, dcl, doff) (((dev) + (dcl) + (doff) / sizeof(struct direntry)) \ & dehash) +extern int prtactive; + static struct denode *msdosfs_hashget __P((dev_t, u_long, u_long)); static int msdosfs_hashins __P((struct denode *)); static void msdosfs_hashrem __P((struct denode *)); @@ -332,6 +334,7 @@ retry: nvp->v_type = VREG; VREF(ldep->de_devvp); *depp = ldep; + nvp->v_uvm.u_size = ldep->de_FileSize; return (0); } @@ -461,7 +464,7 @@ detrunc(dep, length, flags, cred, p) #endif return (error); } - uvm_vnp_uncache(DETOV(dep)); + /* * is this the right place for it? 
*/ @@ -524,7 +527,7 @@ deextend(dep, length, cred) struct ucred *cred; { struct msdosfsmount *pmp = dep->de_pmp; - u_long count; + u_long count, osize; int error; /* @@ -557,8 +560,12 @@ deextend(dep, length, cred) } } + osize = dep->de_FileSize; dep->de_FileSize = length; + uvm_vnp_setsize(DETOV(dep), (voff_t)dep->de_FileSize); dep->de_flag |= DE_UPDATE|DE_MODIFIED; + uvm_vnp_zerorange(DETOV(dep), (off_t)osize, + (size_t)(dep->de_FileSize - osize)); return (deupdat(dep, 1)); } @@ -593,7 +600,6 @@ msdosfs_reclaim(v) } */ *ap = v; struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); - extern int prtactive; #ifdef MSDOSFS_DEBUG printf("msdosfs_reclaim(): dep %08x, file %s, refcnt %d\n", @@ -634,7 +640,6 @@ msdosfs_inactive(v) struct denode *dep = VTODE(vp); struct proc *p = ap->a_p; int error; - extern int prtactive; #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): dep %08x, de_Name[0] %x\n", dep, dep->de_Name[0]); @@ -661,7 +666,9 @@ msdosfs_inactive(v) dep, dep->de_refcnt, vp->v_mount->mnt_flag, MNT_RDONLY); #endif if (dep->de_refcnt <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { - error = detrunc(dep, (u_long)0, 0, NOCRED, NULL); + if (dep->de_FileSize != 0) { + error = detrunc(dep, (u_long)0, 0, NOCRED, NULL); + } dep->de_Name[0] = SLOT_DELETED; } deupdat(dep, 0); diff --git a/sys/msdosfs/msdosfs_fat.c b/sys/msdosfs/msdosfs_fat.c index 772bdfb67e9..3576a663cdc 100644 --- a/sys/msdosfs/msdosfs_fat.c +++ b/sys/msdosfs/msdosfs_fat.c @@ -1,4 +1,4 @@ -/* $OpenBSD: msdosfs_fat.c,v 1.8 1999/01/10 21:50:32 art Exp $ */ +/* $OpenBSD: msdosfs_fat.c,v 1.9 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: msdosfs_fat.c,v 1.26 1997/10/17 11:24:02 ws Exp $ */ /*- @@ -988,8 +988,7 @@ extendfile(dep, count, bpp, ncp, flags) int flags; { int error; - u_long frcn; - u_long cn, got; + u_long frcn = 0, cn, got; struct msdosfsmount *pmp = dep->de_pmp; struct buf *bp; @@ -1060,41 +1059,26 @@ extendfile(dep, count, bpp, ncp, flags) } /* - * Update the "last cluster of the file" entry in the denode's fat - * cache. + * Update the "last cluster of the file" entry in the + * denode's fat cache. */ + fc_setcache(dep, FC_LASTFC, frcn + got - 1, cn + got - 1); - - if (flags & DE_CLEAR) { + if (flags & DE_CLEAR && + (dep->de_Attributes & ATTR_DIRECTORY)) { while (got-- > 0) { - /* - * Get the buf header for the new block of the file. 
- */ - if (dep->de_Attributes & ATTR_DIRECTORY) - bp = getblk(pmp->pm_devvp, cntobn(pmp, cn++), - pmp->pm_bpcluster, 0, 0); - else { - bp = getblk(DETOV(dep), de_cn2bn(pmp, frcn++), - pmp->pm_bpcluster, 0, 0); - /* - * Do the bmap now, as in msdosfs_write - */ - if (pcbmap(dep, - de_bn2cn(pmp, bp->b_lblkno), - &bp->b_blkno, 0, 0)) - bp->b_blkno = -1; - if (bp->b_blkno == -1) - panic("extendfile: pcbmap"); - } + bp = getblk(pmp->pm_devvp, cntobn(pmp, cn++), + pmp->pm_bpcluster, 0, 0); clrbuf(bp); if (bpp) { *bpp = bp; bpp = NULL; - } else + } else { bdwrite(bp); + } } } } - + return (0); } diff --git a/sys/msdosfs/msdosfs_vfsops.c b/sys/msdosfs/msdosfs_vfsops.c index fec59174189..63175e08754 100644 --- a/sys/msdosfs/msdosfs_vfsops.c +++ b/sys/msdosfs/msdosfs_vfsops.c @@ -1,4 +1,4 @@ -/* $OpenBSD: msdosfs_vfsops.c,v 1.25 2001/11/21 21:37:01 csapuntz Exp $ */ +/* $OpenBSD: msdosfs_vfsops.c,v 1.26 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: msdosfs_vfsops.c,v 1.48 1997/10/18 02:54:57 briggs Exp $ */ /*- @@ -584,15 +584,9 @@ msdosfs_mountfs(devvp, mp, p, argp) mp->mnt_data = (qaddr_t)pmp; mp->mnt_stat.f_fsid.val[0] = (long)dev; mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; -#ifdef QUOTA - /* - * If we ever do quotas for DOS filesystems this would be a place - * to fill in the info in the msdosfsmount structure. You dolt, - * quotas on dos filesystems make no sense because files have no - * owners on dos filesystems. of course there is some empty space - * in the directory entry where we could put uid's and gid's. - */ -#endif + mp->mnt_dev_bshift = pmp->pm_bnshift; + mp->mnt_fs_bshift = pmp->pm_cnshift; + devvp->v_specmountpoint = mp; return (0); @@ -720,10 +714,11 @@ msdosfs_sync_vnode(struct vnode *vp, void *arg) struct denode *dep; dep = VTODE(vp); - if (vp->v_type == VNON || - ((dep->de_flag & (DE_ACCESS | DE_CREATE | DE_UPDATE | DE_MODIFIED)) == 0 - && vp->v_dirtyblkhd.lh_first == NULL) || - msa->waitfor == MNT_LAZY) { + if (msa->waitfor == MNT_LAZY || vp->v_type == VNON || + (((dep->de_flag & + (DE_ACCESS | DE_CREATE | DE_UPDATE | DE_MODIFIED)) == 0) && + (LIST_EMPTY(&vp->v_dirtyblkhd) && + vp->v_uvm.u_obj.uo_npages == 0))) { simple_unlock(&vp->v_interlock); return (0); } diff --git a/sys/msdosfs/msdosfs_vnops.c b/sys/msdosfs/msdosfs_vnops.c index 7f1ab384295..1e364039937 100644 --- a/sys/msdosfs/msdosfs_vnops.c +++ b/sys/msdosfs/msdosfs_vnops.c @@ -1,4 +1,4 @@ -/* $OpenBSD: msdosfs_vnops.c,v 1.28 2001/11/06 19:53:20 miod Exp $ */ +/* $OpenBSD: msdosfs_vnops.c,v 1.29 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: msdosfs_vnops.c,v 1.63 1997/10/17 11:24:19 ws Exp $ */ /*- @@ -413,11 +413,11 @@ msdosfs_read(v) int error = 0; int diff; int blsize; - int isadir; long n; long on; daddr_t lbn; - daddr_t rablock; + void *win; + vsize_t bytelen; struct buf *bp; struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); @@ -432,42 +432,45 @@ msdosfs_read(v) if (uio->uio_offset < 0) return (EINVAL); - isadir = dep->de_Attributes & ATTR_DIRECTORY; + if (vp->v_type == VREG) { + while (uio->uio_resid > 0) { + bytelen = MIN(dep->de_FileSize - uio->uio_offset, + uio->uio_resid); + + if (bytelen == 0) + break; + win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, + &bytelen, UBC_READ); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + if (error) + break; + } + dep->de_flag |= DE_ACCESS; + goto out; + } + + /* this loop is only for directories now */ do { lbn = de_cluster(pmp, uio->uio_offset); on = uio->uio_offset & pmp->pm_crbomask; - n = min((u_long) (pmp->pm_bpcluster - on), 
uio->uio_resid); + n = MIN((pmp->pm_bpcluster - on), uio->uio_resid); diff = dep->de_FileSize - uio->uio_offset; if (diff <= 0) return (0); if (diff < n) n = diff; /* convert cluster # to block # if a directory */ - if (isadir) { - error = pcbmap(dep, lbn, &lbn, 0, &blsize); - if (error) - return (error); - } + error = pcbmap(dep, lbn, &lbn, 0, &blsize); + if (error) + return (error); /* * If we are operating on a directory file then be sure to * do i/o with the vnode for the filesystem instead of the * vnode for the directory. */ - if (isadir) { - error = bread(pmp->pm_devvp, lbn, blsize, NOCRED, &bp); - } else { - rablock = lbn + 1; - if (dep->de_lastr + 1 == lbn && - de_cn2off(pmp, rablock) < dep->de_FileSize) - error = breada(vp, de_cn2bn(pmp, lbn), - pmp->pm_bpcluster, de_cn2bn(pmp, rablock), - pmp->pm_bpcluster, NOCRED, &bp); - else - error = bread(vp, de_cn2bn(pmp, lbn), - pmp->pm_bpcluster, NOCRED, &bp); - dep->de_lastr = lbn; - } - n = min(n, pmp->pm_bpcluster - bp->b_resid); + error = bread(pmp->pm_devvp, lbn, blsize, NOCRED, &bp); + n = MIN(n, pmp->pm_bpcluster - bp->b_resid); if (error) { brelse(bp); return (error); @@ -475,8 +478,10 @@ msdosfs_read(v) error = uiomove(bp->b_data + on, (int) n, uio); brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); - if (!isadir && !(vp->v_mount->mnt_flag & MNT_NOATIME)) - dep->de_flag |= DE_ACCESS; + +out: + if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) + error = deupdat(dep, 1); return (error); } @@ -493,19 +498,19 @@ msdosfs_write(v) int a_ioflag; struct ucred *a_cred; } */ *ap = v; - int n; - int croffset; int resid; u_long osize; int error = 0; u_long count; - daddr_t bn, lastcn; - struct buf *bp; + daddr_t lastcn; int ioflag = ap->a_ioflag; + void *win; + vsize_t bytelen; + off_t oldoff; + boolean_t rv; struct uio *uio = ap->a_uio; struct proc *p = uio->uio_procp; struct vnode *vp = ap->a_vp; - struct vnode *thisvp; struct denode *dep = VTODE(vp); struct msdosfsmount *pmp = dep->de_pmp; struct ucred *cred = ap->a_cred; @@ -521,7 +526,6 @@ msdosfs_write(v) case VREG: if (ioflag & IO_APPEND) uio->uio_offset = dep->de_FileSize; - thisvp = vp; break; case VDIR: return EISDIR; @@ -576,84 +580,52 @@ msdosfs_write(v) } else lastcn = de_clcount(pmp, osize) - 1; + if (dep->de_FileSize < uio->uio_offset + resid) { + dep->de_FileSize = uio->uio_offset + resid; + uvm_vnp_setsize(vp, dep->de_FileSize); + } + do { - if (de_cluster(pmp, uio->uio_offset) > lastcn) { + oldoff = uio->uio_offset; + if (de_cluster(pmp, oldoff) > lastcn) { error = ENOSPC; break; } - - bn = de_blk(pmp, uio->uio_offset); - if ((uio->uio_offset & pmp->pm_crbomask) == 0 - && (de_blk(pmp, uio->uio_offset + uio->uio_resid) > de_blk(pmp, uio->uio_offset) - || uio->uio_offset + uio->uio_resid >= dep->de_FileSize)) { - /* - * If either the whole cluster gets written, - * or we write the cluster from its start beyond EOF, - * then no need to read data from disk. - */ - bp = getblk(thisvp, bn, pmp->pm_bpcluster, 0, 0); - clrbuf(bp); - /* - * Do the bmap now, since pcbmap needs buffers - * for the fat table. (see msdosfs_strategy) - */ - if (bp->b_blkno == bp->b_lblkno) { - error = pcbmap(dep, - de_bn2cn(pmp, bp->b_lblkno), - &bp->b_blkno, 0, 0); - if (error) - bp->b_blkno = -1; - } - if (bp->b_blkno == -1) { - brelse(bp); - if (!error) - error = EIO; /* XXX */ - break; - } - } else { - /* - * The block we need to write into exists, so read it in. 
- */ - error = bread(thisvp, bn, pmp->pm_bpcluster, - NOCRED, &bp); - if (error) { - brelse(bp); - break; - } - } - - croffset = uio->uio_offset & pmp->pm_crbomask; - n = min(uio->uio_resid, pmp->pm_bpcluster - croffset); - if (uio->uio_offset + n > dep->de_FileSize) { - dep->de_FileSize = uio->uio_offset + n; - uvm_vnp_setsize(vp, dep->de_FileSize); - } - uvm_vnp_uncache(vp); - /* - * Should these vnode_pager_* functions be done on dir - * files? - */ + bytelen = MIN(dep->de_FileSize - oldoff, uio->uio_resid); /* - * Copy the data from user space into the buf header. + * XXXUBC if file is mapped and this is the last block, + * process one page at a time. */ - error = uiomove(bp->b_data + croffset, n, uio); + if (bytelen == 0) + break; + win = ubc_alloc(&vp->v_uvm.u_obj, oldoff, &bytelen, UBC_READ); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + if (error) { + break; + } /* - * If they want this synchronous then write it and wait for - * it. Otherwise, if on a cluster boundary write it - * asynchronously so we can move on to the next block - * without delay. Otherwise do a delayed write because we - * may want to write somemore into the block later. + * flush what we just wrote if necessary. + * XXXUBC simplistic async flushing. */ - if (ioflag & IO_SYNC) - (void) bwrite(bp); - else if (n + croffset == pmp->pm_bpcluster) - bawrite(bp); - else - bdwrite(bp); - dep->de_flag |= DE_UPDATE; + if (ioflag & IO_SYNC) { + + simple_lock(&vp->v_uvm.u_obj.vmobjlock); + rv = vp->v_uvm.u_obj.pgops->pgo_flush( + &vp->v_uvm.u_obj, oldoff, + oldoff + bytelen, PGO_CLEANIT|PGO_SYNCIO); + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); + } else if (oldoff >> 16 != uio->uio_offset >> 16) { + simple_lock(&vp->v_uvm.u_obj.vmobjlock); + rv = vp->v_uvm.u_obj.pgops->pgo_flush( + &vp->v_uvm.u_obj, (oldoff >> 16) << 16, + (uio->uio_offset >> 16) << 16, PGO_CLEANIT); + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); + } } while (error == 0 && uio->uio_resid > 0); + dep->de_flag |= DE_UPDATE; /* * If the write failed and they want us to, truncate the file back @@ -666,7 +638,8 @@ errexit: uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } else { - detrunc(dep, dep->de_FileSize, ioflag & IO_SYNC, NOCRED, NULL); + detrunc(dep, dep->de_FileSize, ioflag & IO_SYNC, NOCRED, + NULL); if (uio->uio_resid != resid) error = 0; } @@ -1506,11 +1479,11 @@ msdosfs_readdir(v) while (uio->uio_resid > 0) { lbn = de_cluster(pmp, offset - bias); on = (offset - bias) & pmp->pm_crbomask; - n = min(pmp->pm_bpcluster - on, uio->uio_resid); + n = MIN(pmp->pm_bpcluster - on, uio->uio_resid); diff = dep->de_FileSize - (offset - bias); if (diff <= 0) break; - n = min(n, diff); + n = MIN(n, diff); if ((error = pcbmap(dep, lbn, &bn, &cn, &blsize)) != 0) break; error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp); @@ -1518,7 +1491,7 @@ msdosfs_readdir(v) brelse(bp); return (error); } - n = min(n, blsize - bp->b_resid); + n = MIN(n, blsize - bp->b_resid); /* * Convert from dos directory entries to fs-independent @@ -1779,12 +1752,12 @@ msdosfs_strategy(v) biodone(bp); return (error); } -#ifdef DIAGNOSTIC -#endif + /* * Read/write the block from/to the disk that contains the desired * file block. 
*/ + vp = dep->de_devvp; bp->b_dev = vp->v_rdev; VOCALL(vp->v_op, VOFFSET(vop_strategy), ap); @@ -1902,7 +1875,10 @@ struct vnodeopv_entry_desc msdosfs_vnodeop_entries[] = { { &vop_advlock_desc, msdosfs_advlock }, /* advlock */ { &vop_reallocblks_desc, msdosfs_reallocblks }, /* reallocblks */ { &vop_bwrite_desc, vop_generic_bwrite }, /* bwrite */ - { (struct vnodeop_desc *)NULL, (int (*) __P((void *)))NULL } + { &vop_getpages_desc, genfs_getpages }, + { &vop_putpages_desc, genfs_putpages }, + { &vop_size_desc, genfs_size }, + { NULL, NULL } }; struct vnodeopv_desc msdosfs_vnodeop_opv_desc = { &msdosfs_vnodeop_p, msdosfs_vnodeop_entries }; diff --git a/sys/nfs/nfs.h b/sys/nfs/nfs.h index 33435dc23e4..b86819902f2 100644 --- a/sys/nfs/nfs.h +++ b/sys/nfs/nfs.h @@ -1,4 +1,4 @@ -/* $OpenBSD: nfs.h,v 1.13 2001/09/16 00:42:44 millert Exp $ */ +/* $OpenBSD: nfs.h,v 1.14 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: nfs.h,v 1.10.4.1 1996/05/27 11:23:56 fvdl Exp $ */ /* @@ -78,8 +78,18 @@ * Ideally, NFS_DIRBLKSIZ should be bigger, but I've seen servers with * broken NFS/ethernet drivers that won't work with anything bigger (Linux..) */ -#define NFS_DIRBLKSIZ 1024 /* Must be a multiple of DIRBLKSIZ */ +#if 1 +/* + * XXXUBC temp hack because of the removal of b_validend. + * eventually we'll store NFS VDIR data in the page cache as well, + * we'll fix this at that point. + */ +#define NFS_DIRBLKSIZ PAGE_SIZE +#define NFS_READDIRBLKSIZ PAGE_SIZE +#else +#define NFS_DIRBLKSIZ 1024 /* Must be a multiple of DIRBLKSIZ */ #define NFS_READDIRBLKSIZ 512 /* Size of read dir blocks. XXX */ +#endif /* * Oddballs @@ -111,10 +121,10 @@ #endif /* - * The B_INVAFTERWRITE flag should be set to whatever is required by the - * buffer cache code to say "Invalidate the block after it is written back". + * Use the vm_page flag reserved for pager use to indicate pages + * which have been written to the server but not yet committed. */ -#define B_INVAFTERWRITE B_INVAL +#define PG_NEEDCOMMIT PG_PAGER1 /* * The IO_METASYNC flag should be implemented for local file systems. 
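As an illustrative aside (not part of the committed diff): the msdosfs hunks above and the nfs_bio.c hunk below all replace bread()/getblk() copy loops with the same UBC idiom — map a window over the vnode's pager object, uiomove() through it, then release the window. A minimal kernel-context sketch of that read-side loop, with a hypothetical xxx_read_ubc() name and the EOF value passed in by the caller (standing in for de_FileSize, np->n_size, and so on), looks like this:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vnode.h>

#include <uvm/uvm.h>

/*
 * Illustrative sketch only: the shared UBC read loop.
 * "xxx_read_ubc" and "filesize" are hypothetical stand-ins.
 */
int
xxx_read_ubc(struct vnode *vp, struct uio *uio, off_t filesize)
{
        void *win;
        vsize_t bytelen;
        int error = 0;

        while (uio->uio_resid > 0) {
                /* clamp the window to what remains before EOF */
                bytelen = MIN(filesize - uio->uio_offset, uio->uio_resid);
                if (bytelen == 0)
                        break;

                /* map a window of the vnode's pages and copy through it */
                win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset,
                    &bytelen, UBC_READ);
                error = uiomove(win, bytelen, uio);
                ubc_release(win, 0);
                if (error)
                        break;
        }
        return (error);
}

The write side is the same window loop, preceded by uvm_vnp_setsize() when the file grows and followed by a pgo_flush() of the affected range when IO_SYNC is set, as in the msdosfs_write() hunk above and the nfs_write() hunk below.
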
diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c index 1f33bc2eab7..42b25763a88 100644 --- a/sys/nfs/nfs_bio.c +++ b/sys/nfs/nfs_bio.c @@ -1,4 +1,4 @@ -/* $OpenBSD: nfs_bio.c,v 1.24 2001/11/15 23:15:15 art Exp $ */ +/* $OpenBSD: nfs_bio.c,v 1.25 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: nfs_bio.c,v 1.25.4.2 1996/07/08 20:47:04 jtc Exp $ */ /* @@ -50,8 +50,9 @@ #include <sys/mount.h> #include <sys/kernel.h> #include <sys/namei.h> +#include <sys/pool.h> -#include <uvm/uvm_extern.h> +#include <uvm/uvm.h> #include <nfs/rpcv2.h> #include <nfs/nfsproto.h> @@ -70,20 +71,19 @@ struct nfsstats nfsstats; */ int nfs_bioread(vp, uio, ioflag, cred) - register struct vnode *vp; - register struct uio *uio; + struct vnode *vp; + struct uio *uio; int ioflag; struct ucred *cred; { - register struct nfsnode *np = VTONFS(vp); - register int biosize, diff; - struct buf *bp = NULL, *rabp; + struct nfsnode *np = VTONFS(vp); + int biosize; + struct buf *bp = NULL; struct vattr vattr; struct proc *p; struct nfsmount *nmp = VFSTONFS(vp->v_mount); - daddr_t lbn, bn, rabn; caddr_t baddr; - int got_buf = 0, nra, error = 0, n = 0, on = 0, not_readin; + int got_buf = 0, error = 0, n = 0, on = 0; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ) @@ -153,87 +153,25 @@ nfs_bioread(vp, uio, ioflag, cred) switch (vp->v_type) { case VREG: nfsstats.biocache_reads++; - lbn = uio->uio_offset / biosize; - on = uio->uio_offset & (biosize - 1); - bn = lbn * (biosize / DEV_BSIZE); - not_readin = 1; - - /* - * Start the read ahead(s), as required. - */ - if (nfs_numasync > 0 && nmp->nm_readahead > 0) { - for (nra = 0; nra < nmp->nm_readahead && - (lbn + 1 + nra) * biosize < np->n_size; nra++) { - rabn = (lbn + 1 + nra) * (biosize / DEV_BSIZE); - if (!incore(vp, rabn)) { - rabp = nfs_getcacheblk(vp, rabn, biosize, p); - if (!rabp) - return (EINTR); - if ((rabp->b_flags & (B_DELWRI | B_DONE)) == 0) { - rabp->b_flags |= (B_READ | B_ASYNC); - if (nfs_asyncio(rabp)) { - rabp->b_flags |= B_INVAL; - brelse(rabp); - } - } else - brelse(rabp); - } - } - } + error = 0; + while (uio->uio_resid > 0) { + void *win; + vsize_t bytelen = MIN(np->n_size - uio->uio_offset, + uio->uio_resid); - /* - * If the block is in the cache and has the required data - * in a valid region, just copy it out. - * Otherwise, get the block and write back/read in, - * as required. - */ - if ((bp = incore(vp, bn)) && - (bp->b_flags & (B_BUSY | B_WRITEINPROG)) == - (B_BUSY | B_WRITEINPROG)) - got_buf = 0; - else { -again: - bp = nfs_getcacheblk(vp, bn, biosize, p); - if (!bp) - return (EINTR); - got_buf = 1; - if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) { - bp->b_flags |= B_READ; - not_readin = 0; - error = nfs_doio(bp, p); - if (error) { - brelse(bp); - return (error); - } - } - } - n = min((unsigned)(biosize - on), uio->uio_resid); - diff = np->n_size - uio->uio_offset; - if (diff < n) - n = diff; - if (not_readin && n > 0) { - if (on < bp->b_validoff || (on + n) > bp->b_validend) { - if (!got_buf) { - bp = nfs_getcacheblk(vp, bn, biosize, p); - if (!bp) - return (EINTR); - got_buf = 1; - } - bp->b_flags |= B_INVAFTERWRITE; - if (bp->b_dirtyend > 0) { - if ((bp->b_flags & B_DELWRI) == 0) - panic("nfsbioread"); - if (VOP_BWRITE(bp) == EINTR) - return (EINTR); - } else - brelse(bp); - goto again; + if (bytelen == 0) + break; + win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, + &bytelen, UBC_READ); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + if (error) { + break; } } - diff = (on >= bp->b_validend) ? 
0 : (bp->b_validend - on); - if (diff < n) - n = diff; + n = 0; break; + case VLNK: nfsstats.biocache_readlinks++; bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p); @@ -247,7 +185,7 @@ again: return (error); } } - n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); + n = MIN(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); got_buf = 1; on = 0; break; @@ -289,18 +227,17 @@ nfs_write(v) int a_ioflag; struct ucred *a_cred; } */ *ap = v; - register int biosize; - register struct uio *uio = ap->a_uio; + int biosize; + struct uio *uio = ap->a_uio; struct proc *p = uio->uio_procp; - register struct vnode *vp = ap->a_vp; + struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); - register struct ucred *cred = ap->a_cred; + struct ucred *cred = ap->a_cred; int ioflag = ap->a_ioflag; - struct buf *bp; struct vattr vattr; struct nfsmount *nmp = VFSTONFS(vp->v_mount); - daddr_t lbn, bn; - int n, on, error = 0; + int error = 0; + int rv; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) @@ -360,85 +297,47 @@ nfs_write(v) */ biosize = nmp->nm_rsize; do { - - /* - * XXX make sure we aren't cached in the VM page cache - */ - uvm_vnp_uncache(vp); + void *win; + voff_t oldoff = uio->uio_offset; + vsize_t bytelen = uio->uio_resid; nfsstats.biocache_writes++; - lbn = uio->uio_offset / biosize; - on = uio->uio_offset & (biosize-1); - n = min((unsigned)(biosize - on), uio->uio_resid); - bn = lbn * (biosize / DEV_BSIZE); -again: - bp = nfs_getcacheblk(vp, bn, biosize, p); - if (!bp) - return (EINTR); np->n_flag |= NMODIFIED; - if (uio->uio_offset + n > np->n_size) { - np->n_size = uio->uio_offset + n; - uvm_vnp_setsize(vp, (u_long)np->n_size); - } - - /* - * If the new write will leave a contiguous dirty - * area, just update the b_dirtyoff and b_dirtyend, - * otherwise force a write rpc of the old dirty area. 
- */ - if (bp->b_dirtyend > 0 && - (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { - bp->b_proc = p; - if (VOP_BWRITE(bp) == EINTR) - return (EINTR); - goto again; - } - - error = uiomove((char *)bp->b_data + on, n, uio); - if (error) { - bp->b_flags |= B_ERROR; - brelse(bp); - return (error); + if (np->n_size < uio->uio_offset + bytelen) { + np->n_size = uio->uio_offset + bytelen; + uvm_vnp_setsize(vp, np->n_size); } - if (bp->b_dirtyend > 0) { - bp->b_dirtyoff = min(on, bp->b_dirtyoff); - bp->b_dirtyend = max((on + n), bp->b_dirtyend); - } else { - bp->b_dirtyoff = on; - bp->b_dirtyend = on + n; + win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, &bytelen, + UBC_WRITE); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + rv = 1; + if ((ioflag & IO_SYNC)) { + simple_lock(&vp->v_uvm.u_obj.vmobjlock); + rv = vp->v_uvm.u_obj.pgops->pgo_flush( + &vp->v_uvm.u_obj, + oldoff & ~(nmp->nm_wsize - 1), + uio->uio_offset & ~(nmp->nm_wsize - 1), + PGO_CLEANIT|PGO_SYNCIO); + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); + } else if ((oldoff & ~(nmp->nm_wsize - 1)) != + (uio->uio_offset & ~(nmp->nm_wsize - 1))) { + simple_lock(&vp->v_uvm.u_obj.vmobjlock); + rv = vp->v_uvm.u_obj.pgops->pgo_flush( + &vp->v_uvm.u_obj, + oldoff & ~(nmp->nm_wsize - 1), + uio->uio_offset & ~(nmp->nm_wsize - 1), + PGO_CLEANIT|PGO_WEAK); + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); } - if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff || - bp->b_validoff > bp->b_dirtyend) { - bp->b_validoff = bp->b_dirtyoff; - bp->b_validend = bp->b_dirtyend; - } else { - bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff); - bp->b_validend = max(bp->b_validend, bp->b_dirtyend); + if (!rv) { + error = EIO; } - - /* - * Since this block is being modified, it must be written - * again and not just committed. - */ - bp->b_flags &= ~B_NEEDCOMMIT; - - /* - * If the lease is non-cachable or IO_SYNC do bwrite(). 
- */ - if (ioflag & IO_SYNC) { - bp->b_proc = p; - error = VOP_BWRITE(bp); - if (error) - return (error); - } else if ((n + on) == biosize) { - bp->b_proc = (struct proc *)0; - bp->b_flags |= B_ASYNC; - (void)nfs_writebp(bp, 0); - } else { - bdwrite(bp); + if (error) { + break; } - } while (uio->uio_resid > 0 && n > 0); - return (0); + } while (uio->uio_resid > 0); + return (error); } /* @@ -460,9 +359,9 @@ nfs_getcacheblk(vp, bn, size, p) if (nmp->nm_flag & NFSMNT_INT) { bp = getblk(vp, bn, size, PCATCH, 0); - while (bp == (struct buf *)0) { - if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) - return ((struct buf *)0); + while (bp == NULL) { + if (nfs_sigintr(nmp, NULL, p)) + return (NULL); bp = getblk(vp, bn, size, 0, 2 * hz); } } else @@ -502,7 +401,7 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) np->n_flag |= NFLUSHWANT; error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", slptimeo); - if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) + if (error && intrflg && nfs_sigintr(nmp, NULL, p)) return (EINTR); } @@ -512,7 +411,7 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) np->n_flag |= NFLUSHINPROG; error = vinvalbuf(vp, flags, cred, p, slpflag, 0); while (error) { - if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) { + if (intrflg && nfs_sigintr(nmp, NULL, p)) { np->n_flag &= ~NFLUSHINPROG; if (np->n_flag & NFLUSHWANT) { np->n_flag &= ~NFLUSHWANT; @@ -539,41 +438,20 @@ int nfs_asyncio(bp) struct buf *bp; { - int i,s; + int i; if (nfs_numasync == 0) return (EIO); - for (i = 0; i < NFS_MAXASYNCDAEMON; i++) + for (i = 0; i < NFS_MAXASYNCDAEMON; i++) { if (nfs_iodwant[i]) { - if ((bp->b_flags & B_READ) == 0) { - bp->b_flags |= B_WRITEINPROG; - } - TAILQ_INSERT_TAIL(&nfs_bufq, bp, b_freelist); - nfs_iodwant[i] = (struct proc *)0; + nfs_iodwant[i] = NULL; wakeup((caddr_t)&nfs_iodwant[i]); return (0); } + } - /* - * If it is a read or a write already marked B_WRITEINPROG or B_NOCACHE - * return EIO so the process will call nfs_doio() and do it - * synchronously. - */ - if (bp->b_flags & (B_READ | B_WRITEINPROG | B_NOCACHE)) - return (EIO); - - /* - * Just turn the async write into a delayed write, instead of - * doing in synchronously. Hopefully, at least one of the nfsiods - * is currently doing a write for this file and will pick up the - * delayed writes before going back to sleep. 
- */ - s = splbio(); - buf_dirty(bp); - splx(s); - biodone(bp); - return (0); + return (EIO); } /* @@ -589,7 +467,7 @@ nfs_doio(bp, p) register struct vnode *vp; struct nfsnode *np; struct nfsmount *nmp; - int s, error = 0, diff, len, iomode, must_commit = 0; + int error = 0, diff, len, iomode, must_commit = 0; struct uio uio; struct iovec io; @@ -636,9 +514,7 @@ nfs_doio(bp, p) uiop->uio_offset = ((off_t)bp->b_blkno) << DEV_BSHIFT; nfsstats.read_bios++; error = nfs_readrpc(vp, uiop); - if (!error) { - bp->b_validoff = 0; - if (uiop->uio_resid) { + if (!error && uiop->uio_resid) { /* * If len > 0, there is a hole in the file and * no writes after the hole have been pushed to @@ -649,13 +525,9 @@ nfs_doio(bp, p) len = np->n_size - ((((off_t)bp->b_blkno) << DEV_BSHIFT) + diff); if (len > 0) { - len = min(len, uiop->uio_resid); - bzero((char *)bp->b_data + diff, len); - bp->b_validend = diff + len; - } else - bp->b_validend = diff; - } else - bp->b_validend = bp->b_bcount; + len = MIN(len, uiop->uio_resid); + memset((char *)bp->b_data + diff, 0, len); + } } if (p && (vp->v_flag & VTEXT) && (np->n_mtime != np->n_vattr.va_mtime.tv_sec)) { @@ -672,62 +544,19 @@ nfs_doio(bp, p) default: printf("nfs_doio: type %x unexpected\n",vp->v_type); break; - }; + } if (error) { bp->b_flags |= B_ERROR; bp->b_error = error; } } else { - io.iov_len = uiop->uio_resid = bp->b_dirtyend - - bp->b_dirtyoff; - uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE - + bp->b_dirtyoff; - io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; + io.iov_base = bp->b_data; + io.iov_len = uiop->uio_resid = bp->b_bcount; + uiop->uio_offset = ((off_t)bp->b_blkno) << DEV_BSHIFT; uiop->uio_rw = UIO_WRITE; nfsstats.write_bios++; - if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC) - iomode = NFSV3WRITE_UNSTABLE; - else - iomode = NFSV3WRITE_FILESYNC; - bp->b_flags |= B_WRITEINPROG; -#ifdef fvdl_debug - printf("nfs_doio(%x): bp %x doff %d dend %d\n", - vp, bp, bp->b_dirtyoff, bp->b_dirtyend); -#endif + iomode = NFSV3WRITE_UNSTABLE; error = nfs_writerpc(vp, uiop, &iomode, &must_commit); - if (!error && iomode == NFSV3WRITE_UNSTABLE) - bp->b_flags |= B_NEEDCOMMIT; - else - bp->b_flags &= ~B_NEEDCOMMIT; - bp->b_flags &= ~B_WRITEINPROG; - - /* - * For an interrupted write, the buffer is still valid and the - * write hasn't been pushed to the server yet, so we can't set - * B_ERROR and report the interruption by setting B_EINTR. For - * the B_ASYNC case, B_EINTR is not relevant, so the rpc attempt - * is essentially a noop. - * For the case of a V3 write rpc not being committed to stable - * storage, the block is still dirty and requires either a commit - * rpc or another write rpc with iomode == NFSV3WRITE_FILESYNC - * before the block is reused. This is indicated by setting the - * B_DELWRI and B_NEEDCOMMIT flags. - */ - if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) { - s = splbio(); - buf_dirty(bp); - splx(s); - - if (!(bp->b_flags & B_ASYNC) && error) - bp->b_flags |= B_EINTR; - } else { - if (error) { - bp->b_flags |= B_ERROR; - bp->b_error = np->n_error = error; - np->n_flag |= NWRITEERR; - } - bp->b_dirtyoff = bp->b_dirtyend = 0; - } } bp->b_resid = uiop->uio_resid; if (must_commit) @@ -735,3 +564,590 @@ nfs_doio(bp, p) biodone(bp); return (error); } + +/* + * Vnode op for VM getpages. 
+ */ +int +nfs_getpages(v) + void *v; +{ + struct vop_getpages_args /* { + struct vnode *a_vp; + voff_t a_offset; + vm_page_t *a_m; + int *a_count; + int a_centeridx; + vm_prot_t a_access_type; + int a_advice; + int a_flags; + } */ *ap = v; + + off_t eof, offset, origoffset, startoffset, endoffset; + int s, i, error, npages, orignpages, npgs, ridx, pidx, pcount; + vaddr_t kva; + struct buf *bp, *mbp; + struct vnode *vp = ap->a_vp; + struct nfsnode *np = VTONFS(vp); + struct uvm_object *uobj = &vp->v_uvm.u_obj; + struct nfsmount *nmp = VFSTONFS(vp->v_mount); + size_t bytes, iobytes, tailbytes, totalbytes, skipbytes; + int flags = ap->a_flags; + int bsize; + struct vm_page *pgs[16]; /* XXXUBC 16 */ + boolean_t v3 = NFS_ISV3(vp); + boolean_t async = (flags & PGO_SYNCIO) == 0; + boolean_t write = (ap->a_access_type & VM_PROT_WRITE) != 0; + struct proc *p = curproc; + + UVMHIST_FUNC("nfs_getpages"); UVMHIST_CALLED(ubchist); + UVMHIST_LOG(ubchist, "vp %p off 0x%x count %d", vp, (int)ap->a_offset, + *ap->a_count,0); + +#ifdef DIAGNOSTIC + if (ap->a_centeridx < 0 || ap->a_centeridx >= *ap->a_count) { + panic("nfs_getpages: centeridx %d out of range", + ap->a_centeridx); + } +#endif + + error = 0; + origoffset = ap->a_offset; + eof = vp->v_uvm.u_size; + if (origoffset >= eof) { + if ((flags & PGO_LOCKED) == 0) { + simple_unlock(&uobj->vmobjlock); + } + UVMHIST_LOG(ubchist, "off 0x%x past EOF 0x%x", + (int)origoffset, (int)eof,0,0); + return EINVAL; + } + + if (flags & PGO_LOCKED) { + uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m, + UFP_NOWAIT|UFP_NOALLOC); + return 0; + } + + /* vnode is VOP_LOCKed, uobj is locked */ + + bsize = nmp->nm_rsize; + orignpages = MIN(*ap->a_count, + round_page(eof - origoffset) >> PAGE_SHIFT); + npages = orignpages; + startoffset = origoffset & ~(bsize - 1); + endoffset = round_page((origoffset + (npages << PAGE_SHIFT) + + bsize - 1) & ~(bsize - 1)); + endoffset = MIN(endoffset, round_page(eof)); + ridx = (origoffset - startoffset) >> PAGE_SHIFT; + + if (!async && !write) { + int rapages = MAX(PAGE_SIZE, nmp->nm_rsize) >> PAGE_SHIFT; + + (void) VOP_GETPAGES(vp, endoffset, NULL, &rapages, 0, + VM_PROT_READ, 0, 0); + simple_lock(&uobj->vmobjlock); + } + + UVMHIST_LOG(ubchist, "npages %d offset 0x%x", npages, + (int)origoffset, 0,0); + memset(pgs, 0, sizeof(pgs)); + uvn_findpages(uobj, origoffset, &npages, &pgs[ridx], UFP_ALL); + + if (flags & PGO_OVERWRITE) { + UVMHIST_LOG(ubchist, "PGO_OVERWRITE",0,0,0,0); + + /* XXXUBC for now, zero the page if we allocated it */ + for (i = 0; i < npages; i++) { + struct vm_page *pg = pgs[ridx + i]; + + if (pg->flags & PG_FAKE) { + uvm_pagezero(pg); + pg->flags &= ~(PG_FAKE); + } + } + npages += ridx; + if (v3) { + simple_unlock(&uobj->vmobjlock); + goto uncommit; + } + goto out; + } + + /* + * if the pages are already resident, just return them. + */ + + for (i = 0; i < npages; i++) { + struct vm_page *pg = pgs[ridx + i]; + + if ((pg->flags & PG_FAKE) != 0 || + ((ap->a_access_type & VM_PROT_WRITE) && + (pg->flags & PG_RDONLY))) { + break; + } + } + if (i == npages) { + UVMHIST_LOG(ubchist, "returning cached pages", 0,0,0,0); + npages += ridx; + goto out; + } + + /* + * the page wasn't resident and we're not overwriting, + * so we're going to have to do some i/o. + * find any additional pages needed to cover the expanded range. 
+ */ + + if (startoffset != origoffset || + startoffset + (npages << PAGE_SHIFT) != endoffset) { + + /* + * XXXUBC we need to avoid deadlocks caused by locking + * additional pages at lower offsets than pages we + * already have locked. for now, unlock them all and + * start over. + */ + + for (i = 0; i < npages; i++) { + struct vm_page *pg = pgs[ridx + i]; + + if (pg->flags & PG_FAKE) { + pg->flags |= PG_RELEASED; + } + } + uvm_page_unbusy(&pgs[ridx], npages); + memset(pgs, 0, sizeof(pgs)); + + UVMHIST_LOG(ubchist, "reset npages start 0x%x end 0x%x", + startoffset, endoffset, 0,0); + npages = (endoffset - startoffset) >> PAGE_SHIFT; + npgs = npages; + uvn_findpages(uobj, startoffset, &npgs, pgs, UFP_ALL); + } + simple_unlock(&uobj->vmobjlock); + + /* + * update the cached read creds for this node. + */ + + if (np->n_rcred) { + crfree(np->n_rcred); + } + np->n_rcred = curproc->p_ucred; + crhold(np->n_rcred); + + /* + * read the desired page(s). + */ + + totalbytes = npages << PAGE_SHIFT; + bytes = MIN(totalbytes, vp->v_uvm.u_size - startoffset); + tailbytes = totalbytes - bytes; + skipbytes = 0; + + kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WAITOK | + UVMPAGER_MAPIN_READ); + + s = splbio(); + mbp = pool_get(&bufpool, PR_WAITOK); + splx(s); + mbp->b_bufsize = totalbytes; + mbp->b_data = (void *)kva; + mbp->b_resid = mbp->b_bcount = bytes; + mbp->b_flags = B_BUSY|B_READ| (async ? B_CALL|B_ASYNC : 0); + mbp->b_iodone = uvm_aio_biodone; + mbp->b_vp = vp; + mbp->b_proc = NULL; /* XXXUBC */ + LIST_INIT(&mbp->b_dep); + + /* + * if EOF is in the middle of the last page, zero the part past EOF. + */ + + if (tailbytes > 0 && (pgs[bytes >> PAGE_SHIFT]->flags & PG_FAKE)) { + memset((char *)kva + bytes, 0, tailbytes); + } + + /* + * now loop over the pages, reading as needed. + */ + + bp = NULL; + for (offset = startoffset; + bytes > 0; + offset += iobytes, bytes -= iobytes) { + + /* + * skip pages which don't need to be read. + */ + + pidx = (offset - startoffset) >> PAGE_SHIFT; + UVMHIST_LOG(ubchist, "pidx %d offset 0x%x startoffset 0x%x", + pidx, (int)offset, (int)startoffset,0); + while ((pgs[pidx]->flags & PG_FAKE) == 0) { + size_t b; + + KASSERT((offset & (PAGE_SIZE - 1)) == 0); + b = MIN(PAGE_SIZE, bytes); + offset += b; + bytes -= b; + skipbytes += b; + pidx++; + UVMHIST_LOG(ubchist, "skipping, new offset 0x%x", + (int)offset, 0,0,0); + if (bytes == 0) { + goto loopdone; + } + } + + /* + * see how many pages can be read with this i/o. + * reduce the i/o size if necessary. + */ + + iobytes = bytes; + if (offset + iobytes > round_page(offset)) { + pcount = 1; + while (pidx + pcount < npages && + pgs[pidx + pcount]->flags & PG_FAKE) { + pcount++; + } + iobytes = MIN(iobytes, (pcount << PAGE_SHIFT) - + (offset - trunc_page(offset))); + } + iobytes = MIN(iobytes, nmp->nm_rsize); + + /* + * allocate a sub-buf for this piece of the i/o + * (or just use mbp if there's only 1 piece), + * and start it going. 
+ */ + + if (offset == startoffset && iobytes == bytes) { + bp = mbp; + } else { + s = splbio(); + bp = pool_get(&bufpool, PR_WAITOK); + splx(s); + bp->b_data = (char *)kva + offset - startoffset; + bp->b_resid = bp->b_bcount = iobytes; + bp->b_flags = B_BUSY|B_READ|B_CALL|B_ASYNC; + bp->b_iodone = uvm_aio_biodone1; + bp->b_vp = vp; + bp->b_proc = NULL; /* XXXUBC */ + LIST_INIT(&bp->b_dep); + } + bp->b_private = mbp; + bp->b_lblkno = bp->b_blkno = offset >> DEV_BSHIFT; + + UVMHIST_LOG(ubchist, "bp %p offset 0x%x bcount 0x%x blkno 0x%x", + bp, offset, iobytes, bp->b_blkno); + + VOP_STRATEGY(bp); + } + +loopdone: + if (skipbytes) { + s = splbio(); + mbp->b_resid -= skipbytes; + if (mbp->b_resid == 0) { + biodone(mbp); + } + splx(s); + } + if (async) { + UVMHIST_LOG(ubchist, "returning PEND",0,0,0,0); + return EINPROGRESS; + } + if (bp != NULL) { + error = biowait(mbp); + } + s = splbio(); + pool_put(&bufpool, mbp); + splx(s); + uvm_pagermapout(kva, npages); + + if (write && v3) { +uncommit: + lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL, p); + nfs_del_committed_range(vp, origoffset, npages); + nfs_del_tobecommitted_range(vp, origoffset, npages); + simple_lock(&uobj->vmobjlock); + for (i = 0; i < npages; i++) { + if (pgs[i] == NULL) { + continue; + } + pgs[i]->flags &= ~(PG_NEEDCOMMIT|PG_RDONLY); + } + simple_unlock(&uobj->vmobjlock); + lockmgr(&np->n_commitlock, LK_RELEASE, NULL, p); + } + + simple_lock(&uobj->vmobjlock); + +out: + if (error) { + uvm_lock_pageq(); + for (i = 0; i < npages; i++) { + if (pgs[i] == NULL) { + continue; + } + UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x", + pgs[i], pgs[i]->flags, 0,0); + if (pgs[i]->flags & PG_WANTED) { + wakeup(pgs[i]); + } + if (pgs[i]->flags & PG_RELEASED) { + uvm_unlock_pageq(); + (uobj->pgops->pgo_releasepg)(pgs[i], NULL); + uvm_lock_pageq(); + continue; + } + if (pgs[i]->flags & PG_FAKE) { + uvm_pagefree(pgs[i]); + continue; + } + uvm_pageactivate(pgs[i]); + pgs[i]->flags &= ~(PG_WANTED|PG_BUSY); + UVM_PAGE_OWN(pgs[i], NULL); + } + uvm_unlock_pageq(); + simple_unlock(&uobj->vmobjlock); + UVMHIST_LOG(ubchist, "returning error %d", error,0,0,0); + return error; + } + + UVMHIST_LOG(ubchist, "ridx %d count %d", ridx, npages, 0,0); + uvm_lock_pageq(); + for (i = 0; i < npages; i++) { + if (pgs[i] == NULL) { + continue; + } + UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x", + pgs[i], pgs[i]->flags, 0,0); + if (pgs[i]->flags & PG_FAKE) { + UVMHIST_LOG(ubchist, "unfaking pg %p offset 0x%x", + pgs[i], (int)pgs[i]->offset,0,0); + pgs[i]->flags &= ~(PG_FAKE); + pmap_clear_modify(pgs[i]); + pmap_clear_reference(pgs[i]); + } + if (i < ridx || i >= ridx + orignpages || async) { + UVMHIST_LOG(ubchist, "unbusy pg %p offset 0x%x", + pgs[i], (int)pgs[i]->offset,0,0); + if (pgs[i]->flags & PG_WANTED) { + wakeup(pgs[i]); + } + if (pgs[i]->flags & PG_RELEASED) { + uvm_unlock_pageq(); + (uobj->pgops->pgo_releasepg)(pgs[i], NULL); + uvm_lock_pageq(); + continue; + } + uvm_pageactivate(pgs[i]); + pgs[i]->flags &= ~(PG_WANTED|PG_BUSY); + UVM_PAGE_OWN(pgs[i], NULL); + } + } + uvm_unlock_pageq(); + simple_unlock(&uobj->vmobjlock); + if (ap->a_m != NULL) { + memcpy(ap->a_m, &pgs[ridx], + *ap->a_count * sizeof(struct vm_page *)); + } + return 0; +} + +/* + * Vnode op for VM putpages. 
+ */ +int +nfs_putpages(v) + void *v; +{ + struct vop_putpages_args /* { + struct vnode *a_vp; + struct vm_page **a_m; + int a_count; + int a_flags; + int *a_rtvals; + } */ *ap = v; + + struct vnode *vp = ap->a_vp; + struct nfsnode *np = VTONFS(vp); + struct nfsmount *nmp = VFSTONFS(vp->v_mount); + struct buf *bp, *mbp; + struct vm_page **pgs = ap->a_m; + int flags = ap->a_flags; + int npages = ap->a_count; + int s, error, i; + size_t bytes, iobytes, skipbytes; + vaddr_t kva; + off_t offset, origoffset, commitoff; + uint32_t commitbytes; + boolean_t v3 = NFS_ISV3(vp); + boolean_t async = (flags & PGO_SYNCIO) == 0; + boolean_t weak = (flags & PGO_WEAK) && v3; + struct proc *p = curproc; + UVMHIST_FUNC("nfs_putpages"); UVMHIST_CALLED(ubchist); + + UVMHIST_LOG(ubchist, "vp %p pgp %p count %d", + vp, ap->a_m, ap->a_count,0); + + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); + + error = 0; + origoffset = pgs[0]->offset; + bytes = MIN(ap->a_count << PAGE_SHIFT, vp->v_uvm.u_size - origoffset); + skipbytes = 0; + + /* + * if the range has been committed already, mark the pages thus. + * if the range just needs to be committed, we're done + * if it's a weak putpage, otherwise commit the range. + */ + + if (v3) { + lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL, p); + if (nfs_in_committed_range(vp, origoffset, bytes)) { + goto committed; + } + if (nfs_in_tobecommitted_range(vp, origoffset, bytes)) { + if (weak) { + lockmgr(&np->n_commitlock, LK_RELEASE, NULL, p); + return 0; + } else { + commitoff = np->n_pushlo; + commitbytes = (uint32_t)(np->n_pushhi - + np->n_pushlo); + goto commit; + } + } + lockmgr(&np->n_commitlock, LK_RELEASE, NULL, p); + } + + /* + * otherwise write or commit all the pages. + */ + + kva = uvm_pagermapin(pgs, ap->a_count, UVMPAGER_MAPIN_WAITOK| + UVMPAGER_MAPIN_WRITE); + + s = splbio(); + vp->v_numoutput += 2; + mbp = pool_get(&bufpool, PR_WAITOK); + UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x", + vp, mbp, vp->v_numoutput, bytes); + splx(s); + mbp->b_bufsize = npages << PAGE_SHIFT; + mbp->b_data = (void *)kva; + mbp->b_resid = mbp->b_bcount = bytes; + mbp->b_flags = B_BUSY|B_WRITE|B_AGE | + (async ? B_CALL|B_ASYNC : 0) | + (curproc == uvm.pagedaemon_proc ? B_PDAEMON : 0); + mbp->b_iodone = uvm_aio_biodone; + mbp->b_vp = vp; + mbp->b_proc = NULL; /* XXXUBC */ + LIST_INIT(&mbp->b_dep); + + for (offset = origoffset; + bytes > 0; + offset += iobytes, bytes -= iobytes) { + iobytes = MIN(nmp->nm_wsize, bytes); + + /* + * skip writing any pages which only need a commit. 
+ */ + + if ((pgs[(offset - origoffset) >> PAGE_SHIFT]->flags & + PG_NEEDCOMMIT) != 0) { + KASSERT((offset & (PAGE_SIZE - 1)) == 0); + iobytes = MIN(PAGE_SIZE, bytes); + skipbytes += iobytes; + continue; + } + + /* if it's really one i/o, don't make a second buf */ + if (offset == origoffset && iobytes == bytes) { + bp = mbp; + } else { + s = splbio(); + vp->v_numoutput++; + bp = pool_get(&bufpool, PR_WAITOK); + UVMHIST_LOG(ubchist, "vp %p bp %p num now %d", + vp, bp, vp->v_numoutput, 0); + splx(s); + bp->b_data = (char *)kva + (offset - origoffset); + bp->b_resid = bp->b_bcount = iobytes; + bp->b_flags = B_BUSY|B_WRITE|B_CALL|B_ASYNC; + bp->b_iodone = uvm_aio_biodone1; + bp->b_vp = vp; + bp->b_proc = NULL; /* XXXUBC */ + LIST_INIT(&bp->b_dep); + } + bp->b_private = mbp; + bp->b_lblkno = bp->b_blkno = (daddr_t)(offset >> DEV_BSHIFT); + UVMHIST_LOG(ubchist, "bp %p numout %d", + bp, vp->v_numoutput,0,0); + VOP_STRATEGY(bp); + } + if (skipbytes) { + UVMHIST_LOG(ubchist, "skipbytes %d", bytes, 0,0,0); + s = splbio(); + mbp->b_resid -= skipbytes; + if (mbp->b_resid == 0) { + biodone(mbp); + } + splx(s); + } + if (async) { + return EINPROGRESS; + } + if (bp != NULL) { + error = biowait(mbp); + } + + s = splbio(); + if (mbp->b_vp) + vwakeup(mbp->b_vp); + pool_put(&bufpool, mbp); + splx(s); + + uvm_pagermapout(kva, ap->a_count); + if (error || !v3) { + UVMHIST_LOG(ubchist, "returning error %d", error, 0,0,0); + return error; + } + + /* + * for a weak put, mark the range as "to be committed" + * and mark the pages read-only so that we will be notified + * to remove the pages from the "to be committed" range + * if they are made dirty again. + * for a strong put, commit the pages and remove them from the + * "to be committed" range. also, mark them as writable + * and not cleanable with just a commit. + */ + + lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL, p); + if (weak) { + nfs_add_tobecommitted_range(vp, origoffset, + npages << PAGE_SHIFT); + for (i = 0; i < npages; i++) { + pgs[i]->flags |= PG_NEEDCOMMIT|PG_RDONLY; + } + } else { + commitoff = origoffset; + commitbytes = npages << PAGE_SHIFT; +commit: + error = nfs_commit(vp, commitoff, commitbytes, curproc); + nfs_del_tobecommitted_range(vp, commitoff, commitbytes); +committed: + for (i = 0; i < npages; i++) { + pgs[i]->flags &= ~(PG_NEEDCOMMIT|PG_RDONLY); + } + } + lockmgr(&np->n_commitlock, LK_RELEASE, NULL, p); + return error; +} diff --git a/sys/nfs/nfs_node.c b/sys/nfs/nfs_node.c index 987259eadc3..567738584da 100644 --- a/sys/nfs/nfs_node.c +++ b/sys/nfs/nfs_node.c @@ -1,4 +1,4 @@ -/* $OpenBSD: nfs_node.c,v 1.16 2001/11/15 23:15:15 art Exp $ */ +/* $OpenBSD: nfs_node.c,v 1.17 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: nfs_node.c,v 1.16 1996/02/18 11:53:42 fvdl Exp $ */ /* @@ -145,6 +145,7 @@ loop: vp = nvp; np = pool_get(&nfs_node_pool, PR_WAITOK); bzero((caddr_t)np, sizeof *np); + lockinit(&np->n_commitlock, PINOD, "nfsclock", 0, 0); vp->v_data = np; np->n_vnode = vp; @@ -169,6 +170,17 @@ loop: np->n_fhp = &np->n_fh; bcopy((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize); np->n_fhsize = fhsize; + + /* + * XXXUBC doing this while holding the nfs_hashlock is bad, + * but there's no alternative at the moment. 
+ */ + error = VOP_GETATTR(vp, &np->n_vattr, curproc->p_ucred, curproc); + if (error) { + return error; + } + uvm_vnp_setsize(vp, np->n_vattr.va_size); + lockmgr(&nfs_hashlock, LK_RELEASE, 0, p); *npp = np; return (0); @@ -185,11 +197,12 @@ nfs_inactive(v) struct nfsnode *np; struct sillyrename *sp; struct proc *p = curproc; /* XXX */ + struct vnode *vp = ap->a_vp; - np = VTONFS(ap->a_vp); - if (prtactive && ap->a_vp->v_usecount != 0) - vprint("nfs_inactive: pushing active", ap->a_vp); - if (ap->a_vp->v_type != VDIR) { + np = VTONFS(vp); + if (prtactive && vp->v_usecount != 0) + vprint("nfs_inactive: pushing active", vp); + if (vp->v_type != VDIR) { sp = np->n_sillyrename; np->n_sillyrename = (struct sillyrename *)0; } else @@ -198,7 +211,7 @@ nfs_inactive(v) /* * Remove the silly file that was rename'd earlier */ - (void) nfs_vinvalbuf(ap->a_vp, 0, sp->s_cred, p, 1); + (void) nfs_vinvalbuf(vp, 0, sp->s_cred, p, 1); nfs_removeit(sp); crfree(sp->s_cred); vrele(sp->s_dvp); @@ -206,7 +219,7 @@ nfs_inactive(v) } np->n_flag &= (NMODIFIED | NFLUSHINPROG | NFLUSHWANT); - VOP_UNLOCK(ap->a_vp, 0, ap->a_p); + VOP_UNLOCK(vp, 0, ap->a_p); return (0); } diff --git a/sys/nfs/nfs_serv.c b/sys/nfs/nfs_serv.c index 9d4de9fd9a1..9534e7221da 100644 --- a/sys/nfs/nfs_serv.c +++ b/sys/nfs/nfs_serv.c @@ -1,4 +1,4 @@ -/* $OpenBSD: nfs_serv.c,v 1.27 2001/11/06 19:53:21 miod Exp $ */ +/* $OpenBSD: nfs_serv.c,v 1.28 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: nfs_serv.c,v 1.34 1997/05/12 23:37:12 fvdl Exp $ */ /* @@ -1663,8 +1663,6 @@ nfsrv_remove(nfsd, slp, procp, mrq) error = EBUSY; goto out; } - if (vp->v_flag & VTEXT) - uvm_vnp_uncache(vp); out: if (!error) { error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); @@ -3276,11 +3274,10 @@ nfsrv_access(vp, flags, cred, rdonly, p, override) } } /* - * If there's shared text associated with - * the inode, try to free it up once. If - * we fail, we can't allow writing. + * If the vnode is in use as a process's text, + * we can't allow writing. */ - if ((vp->v_flag & VTEXT) && !uvm_vnp_uncache(vp)) + if ((vp->v_flag & VTEXT)) return (ETXTBSY); } error = VOP_ACCESS(vp, flags, cred, p); diff --git a/sys/nfs/nfs_subs.c b/sys/nfs/nfs_subs.c index 9689d9f36a5..4a8bc11528d 100644 --- a/sys/nfs/nfs_subs.c +++ b/sys/nfs/nfs_subs.c @@ -1,4 +1,4 @@ -/* $OpenBSD: nfs_subs.c,v 1.35 2001/11/06 19:53:21 miod Exp $ */ +/* $OpenBSD: nfs_subs.c,v 1.36 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: nfs_subs.c,v 1.27.4.3 1996/07/08 20:34:24 jtc Exp $ */ /* @@ -39,6 +39,40 @@ * @(#)nfs_subs.c 8.8 (Berkeley) 5/22/95 */ +/* + * Copyright 2000 Wasabi Systems, Inc. + * All rights reserved. + * + * Written by Frank van der Linden for Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed for the NetBSD Project by + * Wasabi Systems, Inc. + * 4. The name of Wasabi Systems, Inc. 
may not be used to endorse + * or promote products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ /* * These functions support the macros and help fiddle mbuf chains for @@ -1241,17 +1275,14 @@ nfs_loadattrcache(vpp, mdp, dposp, vaper) vap->va_filerev = 0; } if (vap->va_size != np->n_size) { - if (vap->va_type == VREG) { - if (np->n_flag & NMODIFIED) { - if (vap->va_size < np->n_size) - vap->va_size = np->n_size; - else - np->n_size = vap->va_size; - } else - np->n_size = vap->va_size; - uvm_vnp_setsize(vp, np->n_size); - } else + if ((np->n_flag & NMODIFIED) && vap->va_size < np->n_size) { + vap->va_size = np->n_size; + } else { np->n_size = vap->va_size; + if (vap->va_type == VREG) { + uvm_vnp_setsize(vp, np->n_size); + } + } } np->n_attrstamp = time.tv_sec; if (vaper != NULL) { @@ -1741,26 +1772,216 @@ void nfs_clearcommit(mp) struct mount *mp; { - register struct vnode *vp, *nvp; - register struct buf *bp, *nbp; + struct vnode *vp; + struct vm_page *pg; + struct nfsnode *np; int s; s = splbio(); -loop: - for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { - if (vp->v_mount != mp) /* Paranoia */ - goto loop; - nvp = vp->v_mntvnodes.le_next; - for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; - if ((bp->b_flags & (B_BUSY | B_DELWRI | B_NEEDCOMMIT)) - == (B_DELWRI | B_NEEDCOMMIT)) - bp->b_flags &= ~B_NEEDCOMMIT; + LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) { + if (vp->v_type == VNON) + continue; + np = VTONFS(vp); + np->n_pushlo = np->n_pushhi = np->n_pushedlo = + np->n_pushedhi = 0; + np->n_commitflags &= + ~(NFS_COMMIT_PUSH_VALID | NFS_COMMIT_PUSHED_VALID); + simple_lock(&vp->v_uvm.u_obj.vmobjlock); + TAILQ_FOREACH(pg, &vp->v_uvm.u_obj.memq, listq) { + pg->flags &= ~PG_NEEDCOMMIT; } + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); } splx(s); } +void +nfs_merge_commit_ranges(vp) + struct vnode *vp; +{ + struct nfsnode *np = VTONFS(vp); + + if (!(np->n_commitflags & NFS_COMMIT_PUSHED_VALID)) { + np->n_pushedlo = np->n_pushlo; + np->n_pushedhi = np->n_pushhi; + np->n_commitflags |= NFS_COMMIT_PUSHED_VALID; + } else { + if (np->n_pushlo < np->n_pushedlo) + np->n_pushedlo = np->n_pushlo; + if (np->n_pushhi > np->n_pushedhi) + np->n_pushedhi = np->n_pushhi; + } + + np->n_pushlo = np->n_pushhi = 0; + np->n_commitflags &= ~NFS_COMMIT_PUSH_VALID; + +#ifdef fvdl_debug + printf("merge: committed: %u - %u\n", (unsigned)np->n_pushedlo, + (unsigned)np->n_pushedhi); +#endif +} + +int +nfs_in_committed_range(vp, off, len) + struct vnode *vp; + off_t off, len; +{ + struct nfsnode *np = VTONFS(vp); + off_t lo, hi; + + if (!(np->n_commitflags & NFS_COMMIT_PUSHED_VALID)) + return 0; + lo = off; + hi = lo + len; + + return (lo >= np->n_pushedlo && hi <= np->n_pushedhi); +} + +int 
+nfs_in_tobecommitted_range(vp, off, len) + struct vnode *vp; + off_t off, len; +{ + struct nfsnode *np = VTONFS(vp); + off_t lo, hi; + + if (!(np->n_commitflags & NFS_COMMIT_PUSH_VALID)) + return 0; + lo = off; + hi = lo + len; + + return (lo >= np->n_pushlo && hi <= np->n_pushhi); +} + +void +nfs_add_committed_range(vp, off, len) + struct vnode *vp; + off_t off, len; +{ + struct nfsnode *np = VTONFS(vp); + off_t lo, hi; + + lo = off; + hi = lo + len; + + if (!(np->n_commitflags & NFS_COMMIT_PUSHED_VALID)) { + np->n_pushedlo = lo; + np->n_pushedhi = hi; + np->n_commitflags |= NFS_COMMIT_PUSHED_VALID; + } else { + if (hi > np->n_pushedhi) + np->n_pushedhi = hi; + if (lo < np->n_pushedlo) + np->n_pushedlo = lo; + } +#ifdef fvdl_debug + printf("add: committed: %u - %u\n", (unsigned)np->n_pushedlo, + (unsigned)np->n_pushedhi); +#endif +} + +void +nfs_del_committed_range(vp, off, len) + struct vnode *vp; + off_t off, len; +{ + struct nfsnode *np = VTONFS(vp); + off_t lo, hi; + + if (!(np->n_commitflags & NFS_COMMIT_PUSHED_VALID)) + return; + + lo = off; + hi = lo + len; + + if (lo > np->n_pushedhi || hi < np->n_pushedlo) + return; + if (lo <= np->n_pushedlo) + np->n_pushedlo = hi; + else if (hi >= np->n_pushedhi) + np->n_pushedhi = lo; + else { + /* + * XXX There's only one range. If the deleted range + * is in the middle, pick the largest of the + * contiguous ranges that it leaves. + */ + if ((np->n_pushedlo - lo) > (hi - np->n_pushedhi)) + np->n_pushedhi = lo; + else + np->n_pushedlo = hi; + } +#ifdef fvdl_debug + printf("del: committed: %u - %u\n", (unsigned)np->n_pushedlo, + (unsigned)np->n_pushedhi); +#endif +} + +void +nfs_add_tobecommitted_range(vp, off, len) + struct vnode *vp; + off_t off, len; +{ + struct nfsnode *np = VTONFS(vp); + off_t lo, hi; + + lo = off; + hi = lo + len; + + if (!(np->n_commitflags & NFS_COMMIT_PUSH_VALID)) { + np->n_pushlo = lo; + np->n_pushhi = hi; + np->n_commitflags |= NFS_COMMIT_PUSH_VALID; + } else { + if (lo < np->n_pushlo) + np->n_pushlo = lo; + if (hi > np->n_pushhi) + np->n_pushhi = hi; + } +#ifdef fvdl_debug + printf("add: tobecommitted: %u - %u\n", (unsigned)np->n_pushlo, + (unsigned)np->n_pushhi); +#endif +} + +void +nfs_del_tobecommitted_range(vp, off, len) + struct vnode *vp; + off_t off, len; +{ + struct nfsnode *np = VTONFS(vp); + off_t lo, hi; + + if (!(np->n_commitflags & NFS_COMMIT_PUSH_VALID)) + return; + + lo = off; + hi = lo + len; + + if (lo > np->n_pushhi || hi < np->n_pushlo) + return; + + if (lo <= np->n_pushlo) + np->n_pushlo = hi; + else if (hi >= np->n_pushhi) + np->n_pushhi = lo; + else { + /* + * XXX There's only one range. If the deleted range + * is in the middle, pick the largest of the + * contiguous ranges that it leaves. + */ + if ((np->n_pushlo - lo) > (hi - np->n_pushhi)) + np->n_pushhi = lo; + else + np->n_pushlo = hi; + } +#ifdef fvdl_debug + printf("del: tobecommitted: %u - %u\n", (unsigned)np->n_pushlo, + (unsigned)np->n_pushhi); +#endif +} + /* * Map errnos to NFS error numbers. For Version 3 also filter out error * numbers not specified for the associated procedure. 
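As a worked illustration (not part of the committed diff): the nfs_*_committed_range()/nfs_*_tobecommitted_range() helpers above keep only one range of each kind per nfsnode — n_pushedlo/n_pushedhi for data already committed on the server, n_pushlo/n_pushhi for data still to be committed. Adding always merges into that single range; deleting from its middle keeps just the larger leftover piece, as the XXX comments note. A small userland model of that bookkeeping — hypothetical names, simplified to the intent stated in those comments — behaves like this:

#include <stdio.h>

/* one range per node, mirroring n_pushedlo/n_pushedhi plus a valid flag */
struct crange {
        long long lo, hi;
        int valid;
};

static void
crange_add(struct crange *r, long long off, long long len)
{
        long long lo = off, hi = off + len;

        if (!r->valid) {
                r->lo = lo;
                r->hi = hi;
                r->valid = 1;
                return;
        }
        if (lo < r->lo)
                r->lo = lo;
        if (hi > r->hi)
                r->hi = hi;
}

static void
crange_del(struct crange *r, long long off, long long len)
{
        long long lo = off, hi = off + len;

        if (!r->valid || lo > r->hi || hi < r->lo)
                return;
        if (lo <= r->lo)
                r->lo = hi;             /* trim from the left */
        else if (hi >= r->hi)
                r->hi = lo;             /* trim from the right */
        else if (lo - r->lo > r->hi - hi)
                r->hi = lo;             /* hole in the middle: keep left part */
        else
                r->lo = hi;             /* hole in the middle: keep right part */
}

int
main(void)
{
        struct crange r = { 0, 0, 0 };

        crange_add(&r, 0, 8192);        /* bytes 0..8191 written */
        crange_add(&r, 16384, 8192);    /* range merges out to 0..24576 */
        crange_del(&r, 4096, 4096);     /* middle hole: keep the larger piece */
        printf("range now %lld..%lld\n", r.lo, r.hi);
        return 0;
}

nfs_putpages() above drives these helpers under n_commitlock: pages that already fall inside the committed range need no further write, and pages inside the to-be-committed range are either left there for a weak put or committed and removed from it.
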
diff --git a/sys/nfs/nfs_syscalls.c b/sys/nfs/nfs_syscalls.c index c71a662ccb2..5a189ba344d 100644 --- a/sys/nfs/nfs_syscalls.c +++ b/sys/nfs/nfs_syscalls.c @@ -1,4 +1,4 @@ -/* $OpenBSD: nfs_syscalls.c,v 1.20 2001/11/15 23:15:15 art Exp $ */ +/* $OpenBSD: nfs_syscalls.c,v 1.21 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: nfs_syscalls.c,v 1.19 1996/02/18 11:53:52 fvdl Exp $ */ /* @@ -913,10 +913,9 @@ int nfssvc_iod(p) struct proc *p; { - register struct buf *bp, *nbp; - register int i, myiod; - struct vnode *vp; - int error = 0, s; + struct buf *bp; + int i, myiod; + int error = 0; /* * Assign my position or return error if too many already running @@ -944,39 +943,7 @@ nfssvc_iod(p) while ((bp = nfs_bufq.tqh_first) != NULL) { /* Take one off the front of the list */ TAILQ_REMOVE(&nfs_bufq, bp, b_freelist); - if (bp->b_flags & B_READ) - (void) nfs_doio(bp, NULL); - else do { - /* - * Look for a delayed write for the same vnode, so I can do - * it now. We must grab it before calling nfs_doio() to - * avoid any risk of the vnode getting vclean()'d while - * we are doing the write rpc. - */ - vp = bp->b_vp; - s = splbio(); - for (nbp = vp->v_dirtyblkhd.lh_first; nbp; - nbp = nbp->b_vnbufs.le_next) { - if ((nbp->b_flags & - (B_BUSY|B_DELWRI|B_NEEDCOMMIT|B_NOCACHE))!=B_DELWRI) - continue; - bremfree(nbp); - nbp->b_flags |= (B_BUSY|B_ASYNC); - break; - } - /* - * For the delayed write, do the first part of nfs_bwrite() - * up to, but not including nfs_strategy(). - */ - if (nbp) { - nbp->b_flags &= ~(B_READ|B_DONE|B_ERROR); - buf_undirty(bp); - nbp->b_vp->v_numoutput++; - } - splx(s); - - (void) nfs_doio(bp, NULL); - } while ((bp = nbp) != NULL); + (void) nfs_doio(bp, NULL); } if (error) { PRELE(p); diff --git a/sys/nfs/nfs_var.h b/sys/nfs/nfs_var.h index 861eaf3059e..71985e581a8 100644 --- a/sys/nfs/nfs_var.h +++ b/sys/nfs/nfs_var.h @@ -1,4 +1,4 @@ -/* $OpenBSD: nfs_var.h,v 1.15 2001/11/15 23:15:15 art Exp $ */ +/* $OpenBSD: nfs_var.h,v 1.16 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: nfs_var.h,v 1.3 1996/02/18 11:53:54 fvdl Exp $ */ /* @@ -119,7 +119,7 @@ int nfs_sillyrename __P((struct vnode *, struct vnode *, struct componentname *)); int nfs_lookitup __P((struct vnode *, char *, int, struct ucred *, struct proc *, struct nfsnode **)); -int nfs_commit __P((struct vnode *, u_quad_t, int, struct proc *)); +int nfs_commit __P((struct vnode *, u_quad_t, unsigned, struct proc *)); int nfs_bmap __P((void *)); int nfs_strategy __P((void *)); int nfs_mmap __P((void *)); @@ -134,7 +134,6 @@ int nfs_vfree __P((void *)); int nfs_truncate __P((void *)); int nfs_update __P((void *)); int nfs_bwrite __P((void *)); -int nfs_writebp __P((struct buf *, int)); int nfsspec_access __P((void *)); int nfsspec_read __P((void *)); int nfsspec_write __P((void *)); @@ -258,7 +257,16 @@ void nfsm_srvfattr __P((struct nfsrv_descript *, struct vattr *, int nfsrv_fhtovp __P((fhandle_t *, int, struct vnode **, struct ucred *, struct nfssvc_sock *, struct mbuf *, int *, int)); int netaddr_match __P((int, union nethostaddr *, struct mbuf *)); + void nfs_clearcommit __P((struct mount *)); +void nfs_merge_commit_ranges __P((struct vnode *)); +int nfs_in_committed_range __P((struct vnode *, off_t, off_t)); +int nfs_in_tobecommitted_range __P((struct vnode *, off_t, off_t)); +void nfs_add_committed_range __P((struct vnode *, off_t, off_t)); +void nfs_del_committed_range __P((struct vnode *, off_t, off_t)); +void nfs_add_tobecommitted_range __P((struct vnode *, off_t, off_t)); +void nfs_del_tobecommitted_range __P((struct vnode 
*, off_t, off_t)); + int nfsrv_errmap __P((struct nfsrv_descript *, int)); void nfsrvw_sort __P((gid_t *, int)); void nfsrv_setcred __P((struct ucred *, struct ucred *)); diff --git a/sys/nfs/nfs_vfsops.c b/sys/nfs/nfs_vfsops.c index 13420530fc3..91f84da52b6 100644 --- a/sys/nfs/nfs_vfsops.c +++ b/sys/nfs/nfs_vfsops.c @@ -1,4 +1,4 @@ -/* $OpenBSD: nfs_vfsops.c,v 1.38 2001/11/14 23:37:33 mickey Exp $ */ +/* $OpenBSD: nfs_vfsops.c,v 1.39 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: nfs_vfsops.c,v 1.46.4.1 1996/05/25 22:40:35 fvdl Exp $ */ /* @@ -748,6 +748,8 @@ mountnfs(argp, mp, nam, pth, hst) * point. */ mp->mnt_stat.f_iosize = NFS_MAXDGRAMDATA; + mp->mnt_fs_bshift = DEV_BSHIFT; + mp->mnt_dev_bshift = -1; return (0); bad: @@ -856,8 +858,9 @@ loop: */ if (vp->v_mount != mp) goto loop; - if (VOP_ISLOCKED(vp) || vp->v_dirtyblkhd.lh_first == NULL || - waitfor == MNT_LAZY) + if (waitfor == MNT_LAZY || VOP_ISLOCKED(vp) || + (LIST_EMPTY(&vp->v_dirtyblkhd) && + vp->v_uvm.u_obj.uo_npages == 0)) continue; if (vget(vp, LK_EXCLUSIVE, p)) goto loop; diff --git a/sys/nfs/nfs_vnops.c b/sys/nfs/nfs_vnops.c index 0813b439cb2..4c176c1c1ec 100644 --- a/sys/nfs/nfs_vnops.c +++ b/sys/nfs/nfs_vnops.c @@ -1,4 +1,4 @@ -/* $OpenBSD: nfs_vnops.c,v 1.39 2001/11/15 23:15:15 art Exp $ */ +/* $OpenBSD: nfs_vnops.c,v 1.40 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: nfs_vnops.c,v 1.62.4.1 1996/07/08 20:26:52 jtc Exp $ */ /* @@ -126,7 +126,9 @@ struct vnodeopv_entry_desc nfsv2_vnodeop_entries[] = { { &vop_advlock_desc, nfs_advlock }, /* advlock */ { &vop_reallocblks_desc, nfs_reallocblks }, /* reallocblks */ { &vop_bwrite_desc, nfs_bwrite }, - { (struct vnodeop_desc*)NULL, (int(*) __P((void *)))NULL } + { &vop_getpages_desc, nfs_getpages }, /* getpages */ + { &vop_putpages_desc, nfs_putpages }, /* putpages */ + { NULL, NULL } }; struct vnodeopv_desc nfsv2_vnodeop_opv_desc = { &nfsv2_vnodeop_p, nfsv2_vnodeop_entries }; @@ -151,7 +153,7 @@ struct vnodeopv_entry_desc spec_nfsv2nodeop_entries[] = { { &vop_ioctl_desc, spec_ioctl }, /* ioctl */ { &vop_select_desc, spec_select }, /* select */ { &vop_revoke_desc, spec_revoke }, /* revoke */ - { &vop_fsync_desc, nfs_fsync }, /* fsync */ + { &vop_fsync_desc, spec_fsync }, /* fsync */ { &vop_remove_desc, spec_remove }, /* remove */ { &vop_link_desc, spec_link }, /* link */ { &vop_rename_desc, spec_rename }, /* rename */ @@ -373,11 +375,30 @@ nfs_open(v) return (EACCES); } + /* + * Initialize read and write creds here, for swapfiles + * and other paths that don't set the creds themselves. + */ + + if (ap->a_mode & FREAD) { + if (np->n_rcred) { + crfree(np->n_rcred); + } + np->n_rcred = ap->a_cred; + crhold(np->n_rcred); + } + if (ap->a_mode & FWRITE) { + if (np->n_wcred) { + crfree(np->n_wcred); + } + np->n_wcred = ap->a_cred; + crhold(np->n_wcred); + } + if (np->n_flag & NMODIFIED) { if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1)) == EINTR) return (error); - uvm_vnp_uncache(vp); np->n_attrstamp = 0; if (vp->v_type == VDIR) np->n_direofoffset = 0; @@ -395,7 +416,6 @@ nfs_open(v) if ((error = nfs_vinvalbuf(vp, V_SAVE, ap->a_cred, ap->a_p, 1)) == EINTR) return (error); - uvm_vnp_uncache(vp); np->n_mtime = vattr.va_mtime.tv_sec; } } @@ -2511,7 +2531,7 @@ int nfs_commit(vp, offset, cnt, procp) struct vnode *vp; u_quad_t offset; - int cnt; + unsigned cnt; struct proc *procp; { caddr_t cp; @@ -2626,9 +2646,7 @@ nfs_fsync(v) } /* - * Flush all the blocks associated with a vnode. - * Walk through the buffer pool and push any dirty pages - * associated with the vnode. 
+ * Flush all the data associated with a vnode. */ int nfs_flush(vp, cred, waitfor, p, commit) @@ -2638,154 +2656,19 @@ nfs_flush(vp, cred, waitfor, p, commit) struct proc *p; int commit; { + struct uvm_object *uobj = &vp->v_uvm.u_obj; struct nfsnode *np = VTONFS(vp); - struct buf *bp; - int i; - struct buf *nbp; - struct nfsmount *nmp = VFSTONFS(vp->v_mount); - int s, error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos; - int passone = 1; - u_quad_t off = (u_quad_t)-1, endoff = 0, toff; -#ifndef NFS_COMMITBVECSIZ -#define NFS_COMMITBVECSIZ 20 -#endif - struct buf *bvec[NFS_COMMITBVECSIZ]; + int error; + int flushflags = PGO_ALLPAGES|PGO_CLEANIT|PGO_SYNCIO; + int rv; - if (nmp->nm_flag & NFSMNT_INT) - slpflag = PCATCH; - if (!commit) - passone = 0; - /* - * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the - * server, but nas not been committed to stable storage on the server - * yet. On the first pass, the byte range is worked out and the commit - * rpc is done. On the second pass, nfs_writebp() is called to do the - * job. - */ -again: - bvecpos = 0; - if (NFS_ISV3(vp) && commit) { - s = splbio(); - for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; - if (bvecpos >= NFS_COMMITBVECSIZ) - break; - if ((bp->b_flags & (B_BUSY | B_DELWRI | B_NEEDCOMMIT)) - != (B_DELWRI | B_NEEDCOMMIT)) - continue; - bremfree(bp); - bp->b_flags |= (B_BUSY | B_WRITEINPROG); - /* - * A list of these buffers is kept so that the - * second loop knows which buffers have actually - * been committed. This is necessary, since there - * may be a race between the commit rpc and new - * uncommitted writes on the file. - */ - bvec[bvecpos++] = bp; - toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + - bp->b_dirtyoff; - if (toff < off) - off = toff; - toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff); - if (toff > endoff) - endoff = toff; - } - splx(s); - } - if (bvecpos > 0) { - /* - * Commit data on the server, as required. - */ - retv = nfs_commit(vp, off, (int)(endoff - off), p); - if (retv == NFSERR_STALEWRITEVERF) - nfs_clearcommit(vp->v_mount); - /* - * Now, either mark the blocks I/O done or mark the - * blocks dirty, depending on whether the commit - * succeeded. - */ - for (i = 0; i < bvecpos; i++) { - bp = bvec[i]; - bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG); - if (retv) - brelse(bp); - else { - s = splbio(); - buf_undirty(bp); - vp->v_numoutput++; - bp->b_flags |= B_ASYNC; - bp->b_flags &= ~(B_READ|B_DONE|B_ERROR); - bp->b_dirtyoff = bp->b_dirtyend = 0; - splx(s); - biodone(bp); - } - } - } + error = 0; - /* - * Start/do any write(s) that are required. 
- */ -loop: - s = splbio(); - for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { - nbp = bp->b_vnbufs.le_next; - if (bp->b_flags & B_BUSY) { - if (waitfor != MNT_WAIT || passone) - continue; - bp->b_flags |= B_WANTED; - error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1), - "nfsfsync", slptimeo); - splx(s); - if (error) { - if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) - return (EINTR); - if (slpflag == PCATCH) { - slpflag = 0; - slptimeo = 2 * hz; - } - } - goto loop; - } - if ((bp->b_flags & B_DELWRI) == 0) - panic("nfs_fsync: not dirty"); - if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) - continue; - bremfree(bp); - if (passone || !commit) - bp->b_flags |= (B_BUSY|B_ASYNC); - else - bp->b_flags |= (B_BUSY|B_ASYNC|B_WRITEINPROG|B_NEEDCOMMIT); - splx(s); - VOP_BWRITE(bp); - goto loop; - } - splx(s); - if (passone) { - passone = 0; - goto again; - } - if (waitfor == MNT_WAIT) { - loop2: - s = splbio(); - error = vwaitforio(vp, slpflag, "nfs_fsync", slptimeo); - splx(s); - if (error) { - if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) - return (EINTR); - if (slpflag == PCATCH) { - slpflag = 0; - slptimeo = 2 * hz; - } - goto loop2; - } - - if (vp->v_dirtyblkhd.lh_first && commit) { -#if 0 - vprint("nfs_fsync: dirty", vp); -#endif - goto loop; - } + simple_lock(&uobj->vmobjlock); + rv = (uobj->pgops->pgo_flush)(uobj, 0, 0, flushflags); + simple_unlock(&uobj->vmobjlock); + if (!rv) { + error = EIO; } if (np->n_flag & NWRITEERR) { error = np->n_error; @@ -2860,7 +2743,7 @@ nfs_print(v) } /* - * Just call nfs_writebp() with the force argument set to 1. + * Just call bwrite(). */ int nfs_bwrite(v) @@ -2870,76 +2753,7 @@ nfs_bwrite(v) struct buf *a_bp; } */ *ap = v; - return (nfs_writebp(ap->a_bp, 1)); -} - -/* - * This is a clone of vop_generic_bwrite(), except that B_WRITEINPROG isn't set unless - * the force flag is one and it also handles the B_NEEDCOMMIT flag. - */ -int -nfs_writebp(bp, force) - register struct buf *bp; - int force; -{ - register int oldflags = bp->b_flags, retv = 1; - register struct proc *p = curproc; /* XXX */ - off_t off; - int s; - - if(!(bp->b_flags & B_BUSY)) - panic("bwrite: buffer is not busy???"); - -#ifdef fvdl_debug - printf("nfs_writebp(%x): vp %x voff %d vend %d doff %d dend %d\n", - bp, bp->b_vp, bp->b_validoff, bp->b_validend, bp->b_dirtyoff, - bp->b_dirtyend); -#endif - bp->b_flags &= ~(B_READ|B_DONE|B_ERROR); - - s = splbio(); - buf_undirty(bp); - - if ((oldflags & B_ASYNC) && !(oldflags & B_DELWRI) && p) - ++p->p_stats->p_ru.ru_oublock; - - bp->b_vp->v_numoutput++; - splx(s); - - /* - * If B_NEEDCOMMIT is set, a commit rpc may do the trick. If not - * an actual write will have to be scheduled via. VOP_STRATEGY(). - * If B_WRITEINPROG is already set, then push it with a write anyhow. 
- */ - if ((oldflags & (B_NEEDCOMMIT | B_WRITEINPROG)) == B_NEEDCOMMIT) { - off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; - bp->b_flags |= B_WRITEINPROG; - retv = nfs_commit(bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff, - bp->b_proc); - bp->b_flags &= ~B_WRITEINPROG; - if (!retv) { - bp->b_dirtyoff = bp->b_dirtyend = 0; - bp->b_flags &= ~B_NEEDCOMMIT; - biodone(bp); - } else if (retv == NFSERR_STALEWRITEVERF) - nfs_clearcommit(bp->b_vp->v_mount); - } - if (retv) { - if (force) - bp->b_flags |= B_WRITEINPROG; - VOP_STRATEGY(bp); - } - - if( (oldflags & B_ASYNC) == 0) { - int rtval = biowait(bp); - if (!(oldflags & B_DELWRI) && p) { - ++p->p_stats->p_ru.ru_oublock; - } - brelse(bp); - return (rtval); - } - - return (0); + return (bwrite(ap->a_bp)); } /* diff --git a/sys/nfs/nfsnode.h b/sys/nfs/nfsnode.h index e1e0fd64327..42aaddfa637 100644 --- a/sys/nfs/nfsnode.h +++ b/sys/nfs/nfsnode.h @@ -1,4 +1,4 @@ -/* $OpenBSD: nfsnode.h,v 1.11 2001/11/15 23:15:15 art Exp $ */ +/* $OpenBSD: nfsnode.h,v 1.12 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: nfsnode.h,v 1.16 1996/02/18 11:54:04 fvdl Exp $ */ /* @@ -119,8 +119,20 @@ struct nfsnode { nfsfh_t n_fh; /* Small File Handle */ struct ucred *n_rcred; struct ucred *n_wcred; + off_t n_pushedlo; /* 1st blk in commited range */ + off_t n_pushedhi; /* Last block in range */ + off_t n_pushlo; /* 1st block in commit range */ + off_t n_pushhi; /* Last block in range */ + struct lock n_commitlock; /* Serialize commits XXX */ + int n_commitflags; }; +/* + * Values for n_commitflags + */ +#define NFS_COMMIT_PUSH_VALID 0x0001 /* push range valid */ +#define NFS_COMMIT_PUSHED_VALID 0x0002 /* pushed range valid */ + #define n_atim n_un1.nf_atim #define n_mtim n_un2.nf_mtim #define n_sillyrename n_un3.nf_silly @@ -199,6 +211,8 @@ int nfs_bwrite __P((void *)); int nfs_vget __P((struct mount *, ino_t, struct vnode **)); #define nfs_reallocblks \ ((int (*) __P((void *)))eopnotsupp) +int nfs_getpages __P((void *)); +int nfs_putpages __P((void *)); /* other stuff */ int nfs_removeit __P((struct sillyrename *)); diff --git a/sys/sys/buf.h b/sys/sys/buf.h index cabdcbbe084..054a07c24d5 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -1,4 +1,4 @@ -/* $OpenBSD: buf.h,v 1.33 2001/11/15 23:15:15 art Exp $ */ +/* $OpenBSD: buf.h,v 1.34 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: buf.h,v 1.25 1997/04/09 21:12:17 mycroft Exp $ */ /* @@ -68,6 +68,7 @@ extern struct bio_ops { void (*io_deallocate) __P((struct buf *)); void (*io_movedeps) __P((struct buf *, struct buf *)); int (*io_countdeps) __P((struct buf *, int, int)); + void (*io_pageiodone) __P((struct buf *)); } bioops; /* @@ -96,10 +97,7 @@ struct buf { /* Function to call upon completion. */ void (*b_iodone) __P((struct buf *)); struct vnode *b_vp; /* Device vnode. */ - int b_dirtyoff; /* Offset in buffer of dirty region. */ - int b_dirtyend; /* Offset of end of dirty region. */ - int b_validoff; /* Offset in buffer of valid region. */ - int b_validend; /* Offset of end of valid region. */ + void *b_private; struct workhead b_dep; /* List of filesystem dependencies. */ }; @@ -120,7 +118,6 @@ struct buf { * These flags are kept in b_flags. */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ -#define B_NEEDCOMMIT 0x00000002 /* Needs committing to stable storage */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ #define B_BAD 0x00000008 /* Bad block revectoring in progress. */ #define B_BUSY 0x00000010 /* I/O in progress. 
*/ @@ -144,7 +141,6 @@ struct buf { #define B_UAREA 0x00400000 /* Buffer describes Uarea I/O. */ #define B_WANTED 0x00800000 /* Process wants this buffer. */ #define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */ -#define B_WRITEINPROG 0x01000000 /* Write in progress. */ #define B_XXX 0x02000000 /* Debugging flag. */ #define B_DEFERRED 0x04000000 /* Skipped over for cleaning */ #define B_SCANNED 0x08000000 /* Block already pushed during sync */ diff --git a/sys/sys/mount.h b/sys/sys/mount.h index f398a301c69..50f59e4a532 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -1,4 +1,4 @@ -/* $OpenBSD: mount.h,v 1.40 2001/11/21 21:13:34 csapuntz Exp $ */ +/* $OpenBSD: mount.h,v 1.41 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: mount.h,v 1.48 1996/02/18 11:55:47 fvdl Exp $ */ /* @@ -336,6 +336,8 @@ struct mount { struct lock mnt_lock; /* mount structure lock */ int mnt_flag; /* flags */ int mnt_maxsymlinklen; /* max size of short symlink */ + int mnt_fs_bshift; /* offset shift for lblkno */ + int mnt_dev_bshift; /* shift for device sectors */ struct statfs mnt_stat; /* cache of filesystem stats */ qaddr_t mnt_data; /* private data */ }; diff --git a/sys/sys/param.h b/sys/sys/param.h index a950b196cc3..59fe3a01548 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -1,4 +1,4 @@ -/* $OpenBSD: param.h,v 1.41 2001/09/11 13:11:18 deraadt Exp $ */ +/* $OpenBSD: param.h,v 1.42 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: param.h,v 1.23 1996/03/17 01:02:29 thorpej Exp $ */ /*- @@ -227,3 +227,16 @@ #define RFCNAMEG (1<<10) /* UNIMPL zero plan9 `name space' */ #define RFCENVG (1<<11) /* UNIMPL zero plan9 `env space' */ #define RFCFDG (1<<12) /* zero fd table */ + +#ifdef _KERNEL +/* + * Defaults for Unified Buffer Cache parameters. + */ + +#ifndef UBC_WINSIZE +#define UBC_WINSIZE 8192 +#endif +#ifndef UBC_NWINS +#define UBC_NWINS 1024 +#endif +#endif /* _KERNEL */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 64a90990d0e..9eaf484201f 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -1,4 +1,4 @@ -/* $OpenBSD: vnode.h,v 1.41 2001/11/15 06:22:30 art Exp $ */ +/* $OpenBSD: vnode.h,v 1.42 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: vnode.h,v 1.38 1996/02/29 20:59:05 cgd Exp $ */ /* @@ -90,8 +90,10 @@ struct vnode { struct uvm_vnode v_uvm; /* uvm data */ int (**v_op) __P((void *)); /* vnode operations vector */ enum vtype v_type; /* vnode type */ - u_int v_flag; /* vnode flags (see below) */ - u_int v_usecount; /* reference count of users */ +#define v_flag v_uvm.u_flags +#define v_usecount v_uvm.u_obj.uo_refs +#define v_interlock v_uvm.u_obj.vmobjlock +#define v_numoutput v_uvm.u_nio /* reference count of writers */ u_int v_writecount; /* Flags that can be read/written in interrupts */ @@ -103,7 +105,6 @@ struct vnode { LIST_ENTRY(vnode) v_mntvnodes; /* vnodes for mount point */ struct buflists v_cleanblkhd; /* clean blocklist head */ struct buflists v_dirtyblkhd; /* dirty blocklist head */ - u_int v_numoutput; /* num of writes in progress */ LIST_ENTRY(vnode) v_synclist; /* vnode with dirty buffers */ union { struct mount *vu_mountedhere;/* ptr to mounted vfs (VDIR) */ @@ -112,8 +113,8 @@ struct vnode { struct fifoinfo *vu_fifoinfo; /* fifo (VFIFO) */ } v_un; - struct simplelock v_interlock; /* lock on usecount and flag */ struct lock *v_vnlock; /* used for non-locking fs's */ + struct lock v_glock; /* getpage lock */ enum vtagtype v_tag; /* type of underlying data */ void *v_data; /* private data for fs */ struct { @@ -137,6 +138,9 @@ struct vnode { #define VXWANT 0x0200 /* process is 
waiting for vnode */ #define VALIASED 0x0800 /* vnode has an alias */ #define VLOCKSWORK 0x4000 /* FS supports locking discipline */ +#define VDIRTY 0x8000 /* vnode possibly has dirty pages */ + +#define VSIZENOTSET ((voff_t)-1) /* * (v_bioflag) Flags that may be manipulated by interrupt handlers @@ -446,6 +450,12 @@ int vop_generic_unlock __P((void *)); int vop_generic_revoke __P((void *)); int vop_generic_kqfilter __P((void *)); +/* XXXUBC - doesn't really belong here. */ +int genfs_getpages __P((void *)); +int genfs_putpages __P((void *)); +int genfs_size __P((void *)); + + int vn_stat __P((struct vnode *vp, struct stat *sb, struct proc *p)); int vn_statfile __P((struct file *fp, struct stat *sb, struct proc *p)); int vn_writechk __P((struct vnode *vp)); diff --git a/sys/sys/vnode_if.h b/sys/sys/vnode_if.h index 00cdadabe25..57aff6f4c97 100644 --- a/sys/sys/vnode_if.h +++ b/sys/sys/vnode_if.h @@ -3,7 +3,7 @@ * (Modifications made here may easily be lost!) * * Created from the file: - * OpenBSD: vnode_if.src,v 1.11 2001/06/23 02:21:05 csapuntz Exp + * OpenBSD: vnode_if.src,v 1.13 2001/07/26 20:24:47 millert Exp * by the script: * OpenBSD: vnode_if.sh,v 1.8 2001/02/26 17:34:18 art Exp */ @@ -397,6 +397,52 @@ struct vop_whiteout_args { extern struct vnodeop_desc vop_whiteout_desc; int VOP_WHITEOUT __P((struct vnode *, struct componentname *, int)); +struct vop_ballocn_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + off_t a_offset; + off_t a_length; + struct ucred *a_cred; + int a_flags; +}; +extern struct vnodeop_desc vop_ballocn_desc; +int VOP_BALLOCN __P((struct vnode *, off_t, off_t, struct ucred *, int)); + +struct vop_getpages_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + voff_t a_offset; + vm_page_t *a_m; + int *a_count; + int a_centeridx; + vm_prot_t a_access_type; + int a_advice; + int a_flags; +}; +extern struct vnodeop_desc vop_getpages_desc; +int VOP_GETPAGES __P((struct vnode *, voff_t, vm_page_t *, int *, int, + vm_prot_t, int, int)); + +struct vop_putpages_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + vm_page_t *a_m; + int a_count; + int a_flags; + int *a_rtvals; +}; +extern struct vnodeop_desc vop_putpages_desc; +int VOP_PUTPAGES __P((struct vnode *, vm_page_t *, int, int, int *)); + +struct vop_size_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + off_t a_size; + off_t *a_eobp; +}; +extern struct vnodeop_desc vop_size_desc; +int VOP_SIZE __P((struct vnode *, off_t, off_t *)); + /* Special cases: */ #include <sys/buf.h> diff --git a/sys/ufs/ext2fs/ext2fs_balloc.c b/sys/ufs/ext2fs/ext2fs_balloc.c index 849a8864b2a..78fb0a8371c 100644 --- a/sys/ufs/ext2fs/ext2fs_balloc.c +++ b/sys/ufs/ext2fs/ext2fs_balloc.c @@ -1,5 +1,4 @@ -/* $OpenBSD: ext2fs_balloc.c,v 1.7 2001/11/06 19:53:21 miod Exp $ */ -/* $NetBSD: ext2fs_balloc.c,v 1.10 2001/07/04 21:16:01 chs Exp $ */ +/* $NetBSD: ext2fs_balloc.c,v 1.8 2000/12/10 06:38:31 chs Exp $ */ /* * Copyright (c) 1997 Manuel Bouyer. 
@@ -44,8 +43,9 @@ #include <sys/proc.h> #include <sys/file.h> #include <sys/vnode.h> +#include <sys/mount.h> -#include <uvm/uvm_extern.h> +#include <uvm/uvm.h> #include <ufs/ufs/quota.h> #include <ufs/ufs/inode.h> @@ -73,8 +73,13 @@ ext2fs_buf_alloc(struct inode *ip, daddr_t bn, int size, struct ucred *cred, u_int deallocated; ufs_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1]; int unwindidx = -1; + UVMHIST_FUNC("ext2fs_buf_alloc"); UVMHIST_CALLED(ubchist); - *bpp = NULL; + UVMHIST_LOG(ubchist, "bn 0x%x", bn,0,0,0); + + if (bpp != NULL) { + *bpp = NULL; + } if (bn < 0) return (EFBIG); fs = ip->i_e2fs; @@ -86,20 +91,29 @@ ext2fs_buf_alloc(struct inode *ip, daddr_t bn, int size, struct ucred *cred, if (bn < NDADDR) { nb = fs2h32(ip->i_e2fs_blocks[bn]); if (nb != 0) { - error = bread(vp, bn, fs->e2fs_bsize, NOCRED, &bp); - if (error) { - brelse(bp); - return (error); + + /* + * the block is already allocated, just read it. + */ + + if (bpp != NULL) { + error = bread(vp, bn, fs->e2fs_bsize, NOCRED, + &bp); + if (error) { + brelse(bp); + return (error); + } + *bpp = bp; } - *bpp = bp; return (0); } /* * allocate a new direct block. */ + error = ext2fs_alloc(ip, bn, - ext2fs_blkpref(ip, bn, (int)bn, &ip->i_e2fs_blocks[0]), + ext2fs_blkpref(ip, bn, bn, &ip->i_e2fs_blocks[0]), cred, &newb); if (error) return (error); @@ -107,11 +121,13 @@ ext2fs_buf_alloc(struct inode *ip, daddr_t bn, int size, struct ucred *cred, ip->i_e2fs_last_blk = newb; ip->i_e2fs_blocks[bn] = h2fs32(newb); ip->i_flag |= IN_CHANGE | IN_UPDATE; - bp = getblk(vp, bn, fs->e2fs_bsize, 0, 0); - bp->b_blkno = fsbtodb(fs, newb); - if (flags & B_CLRBUF) - clrbuf(bp); - *bpp = bp; + if (bpp != NULL) { + bp = getblk(vp, bn, fs->e2fs_bsize, 0, 0); + bp->b_blkno = fsbtodb(fs, newb); + if (flags & B_CLRBUF) + clrbuf(bp); + *bpp = bp; + } return (0); } /* @@ -229,26 +245,30 @@ ext2fs_buf_alloc(struct inode *ip, daddr_t bn, int size, struct ucred *cred, } else { bdwrite(bp); } - nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0); - nbp->b_blkno = fsbtodb(fs, nb); - if (flags & B_CLRBUF) - clrbuf(nbp); - *bpp = nbp; + if (bpp != NULL) { + nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0); + nbp->b_blkno = fsbtodb(fs, nb); + if (flags & B_CLRBUF) + clrbuf(nbp); + *bpp = nbp; + } return (0); } brelse(bp); - if (flags & B_CLRBUF) { - error = bread(vp, lbn, (int)fs->e2fs_bsize, NOCRED, &nbp); - if (error) { - brelse(nbp); - goto fail; + if (bpp != NULL) { + if (flags & B_CLRBUF) { + error = bread(vp, lbn, (int)fs->e2fs_bsize, NOCRED, + &nbp); + if (error) { + brelse(nbp); + goto fail; + } + } else { + nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0); + nbp->b_blkno = fsbtodb(fs, nb); } - } else { - nbp = getblk(vp, lbn, fs->e2fs_bsize, 0, 0); - nbp->b_blkno = fsbtodb(fs, nb); + *bpp = nbp; } - - *bpp = nbp; return (0); fail: /* @@ -292,3 +312,153 @@ fail: } return error; } + +int +ext2fs_ballocn(v) + void *v; +{ + struct vop_ballocn_args /* { + struct vnode *a_vp; + off_t a_offset; + off_t a_length; + struct ucred *a_cred; + int a_flags; + } */ *ap = v; + off_t off, len; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct m_ext2fs *fs = ip->i_e2fs; + int error, delta, bshift, bsize; + UVMHIST_FUNC("ext2fs_ballocn"); UVMHIST_CALLED(ubchist); + + bshift = fs->e2fs_bshift; + bsize = 1 << bshift; + + off = ap->a_offset; + len = ap->a_length; + + delta = off & (bsize - 1); + off -= delta; + len += delta; + + while (len > 0) { + bsize = min(bsize, len); + UVMHIST_LOG(ubchist, "off 0x%x len 0x%x bsize 0x%x", + off, len, bsize, 0); + + error 
= ext2fs_buf_alloc(ip, lblkno(fs, off), bsize, ap->a_cred, + NULL, ap->a_flags); + if (error) { + UVMHIST_LOG(ubchist, "error %d", error, 0,0,0); + return error; + } + + /* + * increase file size now, VOP_BALLOC() requires that + * EOF be up-to-date before each call. + */ + + if (ip->i_e2fs_size < off + bsize) { + UVMHIST_LOG(ubchist, "old 0x%x new 0x%x", + ip->i_e2fs_size, off + bsize,0,0); + ip->i_e2fs_size = off + bsize; + if (vp->v_uvm.u_size < ip->i_e2fs_size) { + uvm_vnp_setsize(vp, ip->i_e2fs_size); + } + } + + off += bsize; + len -= bsize; + } + return 0; +} + +/* + * allocate a range of blocks in a file. + * after this function returns, any page entirely contained within the range + * will map to invalid data and thus must be overwritten before it is made + * accessible to others. + */ + +int +ext2fs_balloc_range(vp, off, len, cred, flags) + struct vnode *vp; + off_t off, len; + struct ucred *cred; + int flags; +{ + off_t oldeof, eof, pagestart; + struct uvm_object *uobj; + int i, delta, error, npages; + int bshift = vp->v_mount->mnt_fs_bshift; + int bsize = 1 << bshift; + int ppb = max(bsize >> PAGE_SHIFT, 1); + struct vm_page *pgs[ppb]; + UVMHIST_FUNC("ext2fs_balloc_range"); UVMHIST_CALLED(ubchist); + UVMHIST_LOG(ubchist, "vp %p off 0x%x len 0x%x u_size 0x%x", + vp, off, len, vp->v_uvm.u_size); + + error = 0; + uobj = &vp->v_uvm.u_obj; + oldeof = vp->v_uvm.u_size; + eof = max(oldeof, off + len); + UVMHIST_LOG(ubchist, "new eof 0x%x", eof,0,0,0); + pgs[0] = NULL; + + /* + * cache the new range of the file. this will create zeroed pages + * where the new block will be and keep them locked until the + * new block is allocated, so there will be no window where + * the old contents of the new block is visible to racing threads. + */ + + pagestart = trunc_page(off) & ~(bsize - 1); + npages = min(ppb, (round_page(eof) - pagestart) >> PAGE_SHIFT); + memset(pgs, 0, npages); + simple_lock(&uobj->vmobjlock); + error = VOP_GETPAGES(vp, pagestart, pgs, &npages, 0, + VM_PROT_READ, 0, PGO_SYNCIO | PGO_PASTEOF); + if (error) { + UVMHIST_LOG(ubchist, "getpages %d", error,0,0,0); + goto errout; + } + for (i = 0; i < npages; i++) { + UVMHIST_LOG(ubchist, "got pgs[%d] %p", i, pgs[i],0,0); + KASSERT((pgs[i]->flags & PG_RELEASED) == 0); + pgs[i]->flags &= ~PG_CLEAN; + uvm_pageactivate(pgs[i]); + } + + /* + * adjust off to be block-aligned. + */ + + delta = off & (bsize - 1); + off -= delta; + len += delta; + + /* + * now allocate the range. + */ + + lockmgr(&vp->v_glock, LK_EXCLUSIVE, NULL, curproc); + error = VOP_BALLOCN(vp, off, len, cred, flags); + UVMHIST_LOG(ubchist, "ballocn %d", error,0,0,0); + lockmgr(&vp->v_glock, LK_RELEASE, NULL, curproc); + + /* + * unbusy any pages we are holding. + */ + +errout: + simple_lock(&uobj->vmobjlock); + if (error) { + (void) (uobj->pgops->pgo_flush)(uobj, oldeof, pagestart + ppb, + PGO_FREE); + } + if (pgs[0] != NULL) { + uvm_page_unbusy(pgs, npages); + } + simple_unlock(&uobj->vmobjlock); + return (error); +} diff --git a/sys/ufs/ext2fs/ext2fs_extern.h b/sys/ufs/ext2fs/ext2fs_extern.h index b7a3f96df38..af23fb6ef2d 100644 --- a/sys/ufs/ext2fs/ext2fs_extern.h +++ b/sys/ufs/ext2fs/ext2fs_extern.h @@ -1,5 +1,5 @@ -/* $OpenBSD: ext2fs_extern.h,v 1.10 2001/09/18 00:39:15 art Exp $ */ -/* $NetBSD: ext2fs_extern.h,v 1.1 1997/06/11 09:33:55 bouyer Exp $ */ +/* $OpenBSD: ext2fs_extern.h,v 1.11 2001/11/27 05:27:12 art Exp $ */ +/* $NetBSD: ext2fs_extern.h,v 1.9 2000/11/27 08:39:53 chs Exp $ */ /*- * Copyright (c) 1997 Manuel Bouyer. 
@@ -74,6 +74,9 @@ int ext2fs_inode_free(struct inode *pip, ino_t ino, int mode); /* ext2fs_balloc.c */ int ext2fs_buf_alloc(struct inode *, daddr_t, int, struct ucred *, struct buf **, int); +int ext2fs_ballocn __P((void *)); +int ext2fs_balloc_range __P((struct vnode *, off_t, off_t, struct ucred *, + int)); /* ext2fs_bmap.c */ int ext2fs_bmap __P((void *)); diff --git a/sys/ufs/ext2fs/ext2fs_inode.c b/sys/ufs/ext2fs/ext2fs_inode.c index 4af28d9bf0e..f77c99c47b5 100644 --- a/sys/ufs/ext2fs/ext2fs_inode.c +++ b/sys/ufs/ext2fs/ext2fs_inode.c @@ -1,5 +1,4 @@ -/* $OpenBSD: ext2fs_inode.c,v 1.17 2001/11/06 19:53:21 miod Exp $ */ -/* $NetBSD: ext2fs_inode.c,v 1.24 2001/06/19 12:59:18 wiz Exp $ */ +/* $NetBSD: ext2fs_inode.c,v 1.23 2001/02/18 20:17:04 chs Exp $ */ /* * Copyright (c) 1997 Manuel Bouyer. @@ -59,8 +58,10 @@ #include <ufs/ext2fs/ext2fs.h> #include <ufs/ext2fs/ext2fs_extern.h> +extern int prtactive; + static int ext2fs_indirtrunc __P((struct inode *, ufs_daddr_t, ufs_daddr_t, - ufs_daddr_t, int, long *)); + ufs_daddr_t, int, long *)); /* * Last reference to an inode. If necessary, write or delete it. @@ -78,7 +79,6 @@ ext2fs_inactive(v) struct proc *p = ap->a_p; struct timespec ts; int error = 0; - extern int prtactive; if (prtactive && vp->v_usecount != 0) vprint("ext2fs_inactive: pushing active", vp); @@ -171,14 +171,13 @@ ext2fs_truncate(struct inode *oip, off_t length, int flags, struct ucred *cred) { struct vnode *ovp = ITOV(oip); ufs_daddr_t lastblock; - ufs_daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR]; + ufs_daddr_t bn, lastiblock[NIADDR], indir_lbn[NIADDR]; ufs_daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR]; struct m_ext2fs *fs; - struct buf *bp; int offset, size, level; long count, nblocks, vflags, blocksreleased = 0; int i; - int aflags, error, allerror; + int error, allerror; off_t osize; if (length < 0) @@ -219,22 +218,8 @@ ext2fs_truncate(struct inode *oip, off_t length, int flags, struct ucred *cred) if (length > fs->fs_maxfilesize) return (EFBIG); #endif - offset = blkoff(fs, length - 1); - lbn = lblkno(fs, length - 1); - aflags = B_CLRBUF; - if (flags & IO_SYNC) - aflags |= B_SYNC; - error = ext2fs_buf_alloc(oip, lbn, offset + 1, cred, &bp, - aflags); - if (error) - return (error); - oip->i_e2fs_size = length; - uvm_vnp_setsize(ovp, length); - uvm_vnp_uncache(ovp); - if (aflags & B_SYNC) - bwrite(bp); - else - bawrite(bp); + ext2fs_balloc_range(ovp, length - 1, 1, cred, + flags & IO_SYNC ? B_SYNC : 0); oip->i_flag |= IN_CHANGE | IN_UPDATE; return (ext2fs_update(oip, NULL, NULL, 1)); } @@ -246,28 +231,15 @@ ext2fs_truncate(struct inode *oip, off_t length, int flags, struct ucred *cred) * of subsequent file growth. 
*/ offset = blkoff(fs, length); - if (offset == 0) { - oip->i_e2fs_size = length; - } else { - lbn = lblkno(fs, length); - aflags = B_CLRBUF; - if (flags & IO_SYNC) - aflags |= B_SYNC; - error = ext2fs_buf_alloc(oip, lbn, offset, cred, &bp, - aflags); - if (error) - return (error); - oip->i_e2fs_size = length; + if (offset != 0) { size = fs->e2fs_bsize; - uvm_vnp_setsize(ovp, length); - uvm_vnp_uncache(ovp); - bzero((char *)bp->b_data + offset, (u_int)(size - offset)); - allocbuf(bp, size); - if (aflags & B_SYNC) - bwrite(bp); - else - bawrite(bp); + + /* XXXUBC we should handle more than just VREG */ + uvm_vnp_zerorange(ovp, length, size - offset); } + oip->i_e2fs_size = length; + uvm_vnp_setsize(ovp, length); + /* * Calculate index into inode's block list of * last direct and indirect blocks (if any) diff --git a/sys/ufs/ext2fs/ext2fs_readwrite.c b/sys/ufs/ext2fs/ext2fs_readwrite.c index 9ae4322756f..94424055733 100644 --- a/sys/ufs/ext2fs/ext2fs_readwrite.c +++ b/sys/ufs/ext2fs/ext2fs_readwrite.c @@ -79,6 +79,8 @@ ext2fs_read(v) struct uio *uio; struct m_ext2fs *fs; struct buf *bp; + void *win; + vsize_t bytelen; ufs_daddr_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; @@ -107,6 +109,27 @@ ext2fs_read(v) if (uio->uio_resid == 0) return (0); + if (vp->v_type == VREG) { + error = 0; + while (uio->uio_resid > 0) { + + bytelen = MIN(ip->i_e2fs_size - uio->uio_offset, + uio->uio_resid); + + if (bytelen == 0) { + break; + } + win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, + &bytelen, UBC_READ); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + if (error) { + break; + } + } + goto out; + } + for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_e2fs_size - uio->uio_offset) <= 0) break; @@ -156,8 +179,11 @@ ext2fs_read(v) if (bp != NULL) brelse(bp); +out: if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) { ip->i_flag |= IN_ACCESS; + if ((ap->a_ioflag & IO_SYNC) == IO_SYNC) + error = ext2fs_update(ip, NULL, NULL, 1); } return (error); } @@ -183,12 +209,17 @@ ext2fs_write(v) struct proc *p; ufs_daddr_t lbn; off_t osize; - int blkoffset, error, flags, ioflag, resid, size, xfersize; + int blkoffset, error, flags, ioflag, resid, xfersize; + vsize_t bytelen; + void *win; + off_t oldoff; + boolean_t rv; ioflag = ap->a_ioflag; uio = ap->a_uio; vp = ap->a_vp; ip = VTOI(vp); + error = 0; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) @@ -232,35 +263,65 @@ ext2fs_write(v) resid = uio->uio_resid; osize = ip->i_e2fs_size; - flags = ioflag & IO_SYNC ? B_SYNC : 0; + if (vp->v_type == VREG) { + while (uio->uio_resid > 0) { + oldoff = uio->uio_offset; + blkoffset = blkoff(fs, uio->uio_offset); + bytelen = MIN(fs->e2fs_bsize - blkoffset, + uio->uio_resid); + + /* + * XXXUBC if file is mapped and this is the last block, + * process one page at a time. + */ + + error = ext2fs_balloc_range(vp, uio->uio_offset, + bytelen, ap->a_cred, 0); + if (error) { + break; + } + win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, + &bytelen, UBC_WRITE); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + if (error) { + break; + } + + /* + * flush what we just wrote if necessary. + * XXXUBC simplistic async flushing. + */ + + if (oldoff >> 16 != uio->uio_offset >> 16) { + simple_lock(&vp->v_uvm.u_obj.vmobjlock); + rv = vp->v_uvm.u_obj.pgops->pgo_flush( + &vp->v_uvm.u_obj, (oldoff >> 16) << 16, + (uio->uio_offset >> 16) << 16, PGO_CLEANIT); + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); + } + } + goto out; + } + + flags = ioflag & IO_SYNC ? 
B_SYNC : 0; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); - xfersize = fs->e2fs_bsize - blkoffset; - if (uio->uio_resid < xfersize) - xfersize = uio->uio_resid; - if (fs->e2fs_bsize > xfersize) + xfersize = MIN(fs->e2fs_bsize - blkoffset, uio->uio_resid); + if (xfersize < fs->e2fs_bsize) flags |= B_CLRBUF; else flags &= ~B_CLRBUF; - error = ext2fs_buf_alloc(ip, - lbn, blkoffset + xfersize, ap->a_cred, &bp, flags); + lbn, blkoffset + xfersize, ap->a_cred, &bp, flags); if (error) break; - if (uio->uio_offset + xfersize > ip->i_e2fs_size) { + if (ip->i_e2fs_size < uio->uio_offset + xfersize) { ip->i_e2fs_size = uio->uio_offset + xfersize; - uvm_vnp_setsize(vp, ip->i_e2fs_size); } - uvm_vnp_uncache(vp); - - size = fs->e2fs_bsize - bp->b_resid; - if (size < xfersize) - xfersize = size; - - error = - uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); if (ioflag & IO_SYNC) (void)bwrite(bp); else if (xfersize + blkoffset == fs->e2fs_bsize) { @@ -272,13 +333,14 @@ ext2fs_write(v) bdwrite(bp); if (error || xfersize == 0) break; - ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. */ +out: + ip->i_flag |= IN_CHANGE | IN_UPDATE; if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0) ip->i_e2fs_mode &= ~(ISUID | ISGID); if (error) { @@ -288,8 +350,7 @@ ext2fs_write(v) uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } - } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) { + } else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC) error = ext2fs_update(ip, NULL, NULL, 1); - } return (error); } diff --git a/sys/ufs/ext2fs/ext2fs_subr.c b/sys/ufs/ext2fs/ext2fs_subr.c index 82165b8f242..3263f7e5391 100644 --- a/sys/ufs/ext2fs/ext2fs_subr.c +++ b/sys/ufs/ext2fs/ext2fs_subr.c @@ -1,5 +1,4 @@ -/* $OpenBSD: ext2fs_subr.c,v 1.6 2001/09/18 01:39:13 art Exp $ */ -/* $NetBSD: ext2fs_subr.c,v 1.1 1997/06/11 09:34:03 bouyer Exp $ */ +/* $NetBSD: ext2fs_subr.c,v 1.4 2000/03/30 12:41:11 augustss Exp $ */ /* * Copyright (c) 1997 Manuel Bouyer. @@ -96,7 +95,7 @@ ext2fs_checkoverlap(bp, ip) if (ep == bp || (ep->b_flags & B_INVAL) || ep->b_vp == NULLVP) continue; - if (VOP_BMAP(ep->b_vp, (daddr_t)0, &vp, (daddr_t)0, NULL)) + if (VOP_BMAP(ep->b_vp, (ufs_daddr_t)0, &vp, (ufs_daddr_t)0, NULL)) continue; if (vp != ip->i_devvp) continue; diff --git a/sys/ufs/ext2fs/ext2fs_vfsops.c b/sys/ufs/ext2fs/ext2fs_vfsops.c index 6991cf9d650..e438268acbc 100644 --- a/sys/ufs/ext2fs/ext2fs_vfsops.c +++ b/sys/ufs/ext2fs/ext2fs_vfsops.c @@ -1,5 +1,5 @@ -/* $OpenBSD: ext2fs_vfsops.c,v 1.16 2001/11/21 22:21:48 csapuntz Exp $ */ -/* $NetBSD: ext2fs_vfsops.c,v 1.1 1997/06/11 09:34:07 bouyer Exp $ */ +/* $OpenBSD: ext2fs_vfsops.c,v 1.17 2001/11/27 05:27:12 art Exp $ */ +/* $NetBSD: ext2fs_vfsops.c,v 1.40 2000/11/27 08:39:53 chs Exp $ */ /* * Copyright (c) 1997 Manuel Bouyer. @@ -402,9 +402,11 @@ ext2fs_reload(mountp, cred, p) * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mountp)->um_devvp; - if (vinvalbuf(devvp, 0, cred, p, 0, 0)) + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY, p); + error = vinvalbuf(devvp, 0, cred, p, 0, 0); + VOP_UNLOCK(devvp, 0, p); + if (error) panic("ext2fs_reload: dirty1"); - /* * Step 2: re-read superblock from disk. 
*/ @@ -583,14 +585,18 @@ ext2fs_mountfs(devvp, mp, p) mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; mp->mnt_maxsymlinklen = EXT2_MAXSYMLINKLEN; mp->mnt_flag |= MNT_LOCAL; + mp->mnt_dev_bshift = DEV_BSHIFT; /* XXX */ + mp->mnt_fs_bshift = m_fs->e2fs_bshift; ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_nindir = NINDIR(m_fs); + ump->um_lognindir = ffs(NINDIR(m_fs)) - 1; ump->um_bptrtodb = m_fs->e2fs_fsbtodb; ump->um_seqinc = 1; /* no frags */ devvp->v_specmountpoint = mp; return (0); + out: if (bp) brelse(bp); @@ -924,6 +930,7 @@ ext2fs_vget(mp, ino, vpp) ip->i_flag |= IN_MODIFIED; } + vp->v_uvm.u_size = ip->i_e2fs_size; *vpp = vp; return (0); } diff --git a/sys/ufs/ext2fs/ext2fs_vnops.c b/sys/ufs/ext2fs/ext2fs_vnops.c index 0faba75ffd2..fffdd494d5a 100644 --- a/sys/ufs/ext2fs/ext2fs_vnops.c +++ b/sys/ufs/ext2fs/ext2fs_vnops.c @@ -1,5 +1,5 @@ -/* $OpenBSD: ext2fs_vnops.c,v 1.17 2001/11/06 19:53:21 miod Exp $ */ -/* $NetBSD: ext2fs_vnops.c,v 1.1 1997/06/11 09:34:09 bouyer Exp $ */ +/* $OpenBSD: ext2fs_vnops.c,v 1.18 2001/11/27 05:27:12 art Exp $ */ +/* $NetBSD: ext2fs_vnops.c,v 1.30 2000/11/27 08:39:53 chs Exp $ */ /* * Copyright (c) 1997 Manuel Bouyer. @@ -402,8 +402,6 @@ ext2fs_chmod(vp, mode, cred, p) ip->i_e2fs_mode &= ~ALLPERMS; ip->i_e2fs_mode |= (mode & ALLPERMS); ip->i_flag |= IN_CHANGE; - if ((vp->v_flag & VTEXT) && (ip->i_e2fs_mode & S_ISTXT) == 0) - (void) uvm_vnp_uncache(vp); return (0); } @@ -1469,7 +1467,11 @@ struct vnodeopv_entry_desc ext2fs_vnodeop_entries[] = { { &vop_pathconf_desc, ufs_pathconf }, /* pathconf */ { &vop_advlock_desc, ext2fs_advlock }, /* advlock */ { &vop_bwrite_desc, vop_generic_bwrite }, /* bwrite */ - { (struct vnodeop_desc*)NULL, (int(*) __P((void*)))NULL } + { &vop_ballocn_desc, ext2fs_ballocn }, + { &vop_getpages_desc, genfs_getpages }, + { &vop_putpages_desc, genfs_putpages }, + { &vop_size_desc, genfs_size }, + { NULL, NULL } }; struct vnodeopv_desc ext2fs_vnodeop_opv_desc = { &ext2fs_vnodeop_p, ext2fs_vnodeop_entries }; diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c index 8ddf99405fc..a53d87828c3 100644 --- a/sys/ufs/ffs/ffs_alloc.c +++ b/sys/ufs/ffs/ffs_alloc.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ffs_alloc.c,v 1.35 2001/11/21 21:23:56 csapuntz Exp $ */ +/* $OpenBSD: ffs_alloc.c,v 1.36 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: ffs_alloc.c,v 1.11 1996/05/11 18:27:09 mycroft Exp $ */ /* @@ -169,14 +169,15 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp, blknop) struct buf **bpp; ufs_daddr_t *blknop; { - register struct fs *fs; - struct buf *bp = NULL; + struct fs *fs; + struct buf *bp; ufs_daddr_t quota_updated = 0; int cg, request, error; daddr_t bprev, bno; if (bpp != NULL) *bpp = NULL; + fs = ip->i_fs; #ifdef DIAGNOSTIC if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || @@ -282,7 +283,6 @@ ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp, blknop) if (bno <= 0) goto nospace; - (void) uvm_vnp_uncache(ITOV(ip)); if (!DOINGSOFTDEP(ITOV(ip))) ffs_blkfree(ip, bprev, (long)osize); if (nsize < request) @@ -362,7 +362,8 @@ ffs_reallocblks(v) struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp; int i, len, start_lvl, end_lvl, pref, ssize; - if (doreallocblks == 0) + /* XXXUBC - don't reallocblks for now */ + if (1 || doreallocblks == 0) return (ENOSPC); vp = ap->a_vp; diff --git a/sys/ufs/ffs/ffs_balloc.c b/sys/ufs/ffs/ffs_balloc.c index 009adc91ff9..5f6ddc3d94e 100644 --- a/sys/ufs/ffs/ffs_balloc.c +++ b/sys/ufs/ffs/ffs_balloc.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ffs_balloc.c,v 
1.18 2001/11/21 21:23:56 csapuntz Exp $ */ +/* $OpenBSD: ffs_balloc.c,v 1.19 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: ffs_balloc.c,v 1.3 1996/02/09 22:22:21 christos Exp $ */ /* @@ -402,3 +402,61 @@ fail: return (error); } + +int +ffs_ballocn(v) + void *v; +{ + struct vop_ballocn_args /* { + struct vnode *a_vp; + off_t a_offset; + off_t a_length; + struct ucred *a_cred; + int a_flags; + } */ *ap = v; + + off_t off, len; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct fs *fs = ip->i_fs; + int error, delta, bshift, bsize; + + error = 0; + bshift = fs->fs_bshift; + bsize = 1 << bshift; + + off = ap->a_offset; + len = ap->a_length; + + delta = off & (bsize - 1); + off -= delta; + len += delta; + + while (len > 0) { + bsize = min(bsize, len); + + error = ffs_balloc(ip, off, bsize, ap->a_cred, ap->a_flags, + NULL); + if (error) { + goto out; + } + + /* + * increase file size now, VOP_BALLOC() requires that + * EOF be up-to-date before each call. + */ + + if (ip->i_ffs_size < off + bsize) { + ip->i_ffs_size = off + bsize; + if (vp->v_uvm.u_size < ip->i_ffs_size) { + uvm_vnp_setsize(vp, ip->i_ffs_size); + } + } + + off += bsize; + len -= bsize; + } + +out: + return error; + } diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h index eeeba209c69..2875a332a57 100644 --- a/sys/ufs/ffs/ffs_extern.h +++ b/sys/ufs/ffs/ffs_extern.h @@ -1,4 +1,4 @@ -/* $OpenBSD: ffs_extern.h,v 1.14 2001/11/13 00:10:56 art Exp $ */ +/* $OpenBSD: ffs_extern.h,v 1.15 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: ffs_extern.h,v 1.4 1996/02/09 22:22:22 christos Exp $ */ /*- @@ -87,6 +87,7 @@ void ffs_clusteracct __P((struct fs *, struct cg *, daddr_t, int)); /* ffs_balloc.c */ int ffs_balloc(struct inode *, off_t, int, struct ucred *, int, struct buf **); +int ffs_ballocn(void *); /* ffs_inode.c */ int ffs_init __P((struct vfsconf *)); @@ -128,7 +129,7 @@ int ffs_read __P((void *)); int ffs_write __P((void *)); int ffs_fsync __P((void *)); int ffs_reclaim __P((void *)); - +int ffs_size __P((void *)); /* * Soft dependency function prototypes. diff --git a/sys/ufs/ffs/ffs_inode.c b/sys/ufs/ffs/ffs_inode.c index c81c795b2ac..cddf6a368ca 100644 --- a/sys/ufs/ffs/ffs_inode.c +++ b/sys/ufs/ffs/ffs_inode.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ffs_inode.c,v 1.25 2001/11/21 21:23:56 csapuntz Exp $ */ +/* $OpenBSD: ffs_inode.c,v 1.26 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: ffs_inode.c,v 1.10 1996/05/11 18:27:19 mycroft Exp $ */ /* @@ -150,14 +150,14 @@ ffs_truncate(struct inode *oip, off_t length, int flags, struct ucred *cred) { struct vnode *ovp; daddr_t lastblock; - daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR]; + daddr_t bn, lastiblock[NIADDR], indir_lbn[NIADDR]; daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR]; struct fs *fs; - struct buf *bp; + struct proc *p = curproc; int offset, size, level; long count, nblocks, vflags, blocksreleased = 0; register int i; - int aflags, error, allerror; + int error, allerror; off_t osize; if (length < 0) @@ -188,10 +188,55 @@ ffs_truncate(struct inode *oip, off_t length, int flags, struct ucred *cred) if ((error = getinoquota(oip)) != 0) return (error); - uvm_vnp_setsize(ovp, length); + fs = oip->i_fs; + if (length > fs->fs_maxfilesize) + return (EFBIG); + osize = oip->i_ffs_size; oip->i_ci.ci_lasta = oip->i_ci.ci_clen = oip->i_ci.ci_cstart = oip->i_ci.ci_lastw = 0; + /* + * Lengthen the size of the file. We must ensure that the + * last byte of the file is allocated. Since the smallest + * value of osize is 0, length will be at least 1. 
+ */ + + if (osize < length) { + ufs_balloc_range(ovp, length - 1, 1, cred, + flags & IO_SYNC ? B_SYNC : 0); + oip->i_flag |= IN_CHANGE | IN_UPDATE; + return (UFS_UPDATE(oip, 1)); + } + + /* + * When truncating a regular file down to a non-block-aligned size, + * we must zero the part of last block which is past the new EOF. + * We must synchronously flush the zeroed pages to disk + * since the new pages will be invalidated as soon as we + * inform the VM system of the new, smaller size. + * We must to this before acquiring the GLOCK, since fetching + * the pages will acquire the GLOCK internally. + * So there is a window where another thread could see a whole + * zeroed page past EOF, but that's life. + */ + + offset = blkoff(fs, length); + if (ovp->v_type == VREG && length < osize && offset != 0) { + struct uvm_object *uobj; + voff_t eoz; + + size = blksize(fs, oip, lblkno(fs, length)); + eoz = min(lblktosize(fs, lblkno(fs, length)) + size, osize); + uvm_vnp_zerorange(ovp, length, eoz - length); + uobj = &ovp->v_uvm.u_obj; + simple_lock(&uobj->vmobjlock); + uobj->pgops->pgo_flush(uobj, length, eoz, + PGO_CLEANIT|PGO_DEACTIVATE|PGO_SYNCIO); + simple_unlock(&ovp->v_uvm.u_obj.vmobjlock); + } + + lockmgr(&ovp->v_glock, LK_EXCLUSIVE, NULL, p); + if (DOINGSOFTDEP(ovp)) { if (length > 0 || softdep_slowdown(ovp)) { /* @@ -204,80 +249,29 @@ ffs_truncate(struct inode *oip, off_t length, int flags, struct ucred *cred) * so that it will have no data structures left. */ if ((error = VOP_FSYNC(ovp, cred, MNT_WAIT, - curproc)) != 0) + curproc)) != 0) { + lockmgr(&ovp->v_glock, LK_RELEASE, NULL, p); return (error); + } } else { + uvm_vnp_setsize(ovp, length); (void)ufs_quota_free_blocks(oip, oip->i_ffs_blocks, NOCRED); softdep_setup_freeblocks(oip, length); (void) vinvalbuf(ovp, 0, cred, curproc, 0, 0); + lockmgr(&ovp->v_glock, LK_RELEASE, NULL, p); oip->i_flag |= IN_CHANGE | IN_UPDATE; return (UFS_UPDATE(oip, 0)); } } - fs = oip->i_fs; - osize = oip->i_ffs_size; /* - * Lengthen the size of the file. We must ensure that the - * last byte of the file is allocated. Since the smallest - * value of osize is 0, length will be at least 1. + * Reduce the size of the file. */ - if (osize < length) { - if (length > fs->fs_maxfilesize) - return (EFBIG); - aflags = B_CLRBUF; - if (flags & IO_SYNC) - aflags |= B_SYNC; - error = UFS_BUF_ALLOC(oip, length - 1, 1, - cred, aflags, &bp); - if (error) - return (error); - oip->i_ffs_size = length; - uvm_vnp_setsize(ovp, length); - (void) uvm_vnp_uncache(ovp); - if (aflags & B_SYNC) - bwrite(bp); - else - bawrite(bp); - oip->i_flag |= IN_CHANGE | IN_UPDATE; - return (UFS_UPDATE(oip, MNT_WAIT)); - } + oip->i_ffs_size = length; uvm_vnp_setsize(ovp, length); /* - * Shorten the size of the file. If the file is not being - * truncated to a block boundary, the contents of the - * partial block following the end of the file must be - * zero'ed in case it ever becomes accessible again because - * of subsequent file growth. Directories however are not - * zero'ed as they should grow back initialized to empty. 
- */ - offset = blkoff(fs, length); - if (offset == 0) { - oip->i_ffs_size = length; - } else { - lbn = lblkno(fs, length); - aflags = B_CLRBUF; - if (flags & IO_SYNC) - aflags |= B_SYNC; - error = UFS_BUF_ALLOC(oip, length - 1, 1, - cred, aflags, &bp); - if (error) - return (error); - oip->i_ffs_size = length; - size = blksize(fs, oip, lbn); - (void) uvm_vnp_uncache(ovp); - if (ovp->v_type != VDIR) - bzero((char *)bp->b_data + offset, - (u_int)(size - offset)); - allocbuf(bp, size); - if (aflags & B_SYNC) - bwrite(bp); - else - bawrite(bp); - } - /* * Calculate index into inode's block list of * last direct and indirect blocks (if any) * which we want to keep. Lastblock is -1 when @@ -402,6 +396,7 @@ done: oip->i_ffs_blocks -= blocksreleased; if (oip->i_ffs_blocks < 0) /* sanity */ oip->i_ffs_blocks = 0; + lockmgr(&ovp->v_glock, LK_RELEASE, NULL, p); oip->i_flag |= IN_CHANGE; (void)ufs_quota_free_blocks(oip, blocksreleased, NOCRED); return (allerror); diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c index 1d66094cc06..7a66eed4d8b 100644 --- a/sys/ufs/ffs/ffs_softdep.c +++ b/sys/ufs/ffs/ffs_softdep.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ffs_softdep.c,v 1.25 2001/11/13 14:19:24 art Exp $ */ +/* $OpenBSD: ffs_softdep.c,v 1.26 2001/11/27 05:27:12 art Exp $ */ /* * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved. * @@ -56,6 +56,7 @@ #include <sys/malloc.h> #include <sys/mount.h> #include <sys/proc.h> +#include <sys/pool.h> #include <sys/syslog.h> #include <sys/systm.h> #include <sys/vnode.h> @@ -69,6 +70,10 @@ #include <ufs/ffs/ffs_extern.h> #include <ufs/ufs/ufs_extern.h> +#include <uvm/uvm.h> +struct pool sdpcpool; +int softdep_lockedbufs; + #define STATIC /* @@ -109,6 +114,13 @@ extern char *memname[]; */ /* + * Definitions for page cache info hashtable. + */ +#define PCBPHASHSIZE 1024 +LIST_HEAD(, buf) pcbphashhead[PCBPHASHSIZE]; +#define PCBPHASH(vp, lbn) ((((vaddr_t)(vp) >> 8) ^ (lbn)) & (PCBPHASHSIZE - 1)) + +/* * Internal function prototypes. */ STATIC void softdep_error __P((char *, int)); @@ -160,6 +172,13 @@ STATIC void pause_timer __P((void *)); STATIC int request_cleanup __P((int, int)); STATIC int process_worklist_item __P((struct mount *, int)); STATIC void add_to_worklist __P((struct worklist *)); +STATIC struct buf *softdep_setup_pagecache __P((struct inode *, ufs_lbn_t, + long)); +STATIC void softdep_collect_pagecache __P((struct inode *)); +STATIC void softdep_free_pagecache __P((struct inode *)); +STATIC struct vnode *softdep_lookupvp(struct fs *, ino_t); +STATIC struct buf *softdep_lookup_pcbp __P((struct vnode *, ufs_lbn_t)); +void softdep_pageiodone __P((struct buf *)); /* * Exported softdep operations. 
@@ -176,6 +195,7 @@ struct bio_ops bioops = { softdep_deallocate_dependencies, /* io_deallocate */ softdep_move_dependencies, /* io_movedeps */ softdep_count_dependencies, /* io_countdeps */ + softdep_pageiodone, /* io_pagedone */ }; /* @@ -1055,6 +1075,7 @@ top: void softdep_initialize() { + int i; LIST_INIT(&mkdirlisthd); LIST_INIT(&softdep_workitem_pending); @@ -1073,6 +1094,11 @@ softdep_initialize() newblk_hashtbl = hashinit(64, M_NEWBLK, M_WAITOK, &newblk_hash); sema_init(&newblk_in_progress, "newblk", PRIBIO, 0); timeout_set(&proc_waiting_timeout, pause_timer, 0); + pool_init(&sdpcpool, sizeof(struct buf), 0, 0, 0, "sdpcpool", + 0, pool_page_alloc_nointr, pool_page_free_nointr, M_TEMP); + for (i = 0; i < PCBPHASHSIZE; i++) { + LIST_INIT(&pcbphashhead[i]); + } } /* @@ -1325,11 +1351,16 @@ softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) LIST_REMOVE(newblk, nb_hash); FREE(newblk, M_NEWBLK); + /* + * If we were not passed a bp to attach the dep to, + * then this must be for a regular file. + * Allocate a buffer to represent the page cache pages + * that are the real dependency. The pages themselves + * cannot refer to the dependency since we don't want to + * add a field to struct vm_page for this. + */ if (bp == NULL) { - /* - * XXXUBC - Yes, I know how to fix this, but not right now. - */ - panic("softdep_setup_allocdirect: Bonk art in the head\n"); + bp = softdep_setup_pagecache(ip, lbn, newsize); } WORKLIST_INSERT(&bp->b_dep, &adp->ad_list); if (lbn >= NDADDR) { @@ -1563,10 +1594,7 @@ softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) pagedep_lookup(ip, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&nbp->b_dep, &pagedep->pd_list); if (nbp == NULL) { - /* - * XXXUBC - Yes, I know how to fix this, but not right now. - */ - panic("softdep_setup_allocindir_page: Bonk art in the head\n"); + nbp = softdep_setup_pagecache(ip, lbn, ip->i_fs->fs_bsize); } WORKLIST_INSERT(&nbp->b_dep, &aip->ai_list); FREE_LOCK(&lk); @@ -1745,6 +1773,7 @@ softdep_setup_freeblocks(ip, length) int i, delay, error; fs = ip->i_fs; + vp = ITOV(ip); if (length != 0) panic("softdep_setup_freeblocks: non-zero length"); MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks), @@ -1804,9 +1833,15 @@ softdep_setup_freeblocks(ip, length) * with this inode are obsolete and can simply be de-allocated. * We must first merge the two dependency lists to get rid of * any duplicate freefrag structures, then purge the merged list. + * We must remove any pagecache markers from the pagecache + * hashtable first because any I/Os in flight will want to see + * dependencies attached to their pagecache markers. We cannot + * free the pagecache markers until after we've freed all the + * dependencies that reference them later. * If we still have a bitmap dependency, then the inode has never * been written to disk, so we can free any fragments without delay. */ + softdep_collect_pagecache(ip); merge_inode_lists(inodedep); while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) free_allocdirect(&inodedep->id_inoupdt, adp, delay); @@ -1818,7 +1853,6 @@ softdep_setup_freeblocks(ip, length) * Once they are all there, walk the list and get rid of * any dependencies. 
*/ - vp = ITOV(ip); ACQUIRE_LOCK(&lk); drain_output(vp, 1); while (getdirtybuf(&LIST_FIRST(&vp->v_dirtyblkhd), MNT_WAIT)) { @@ -1830,6 +1864,7 @@ softdep_setup_freeblocks(ip, length) brelse(bp); ACQUIRE_LOCK(&lk); } + softdep_free_pagecache(ip); if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) != 0) (void) free_inodedep(inodedep); FREE_LOCK(&lk); @@ -2898,7 +2933,6 @@ handle_workitem_freefile(freefile) struct freefile *freefile; { struct fs *fs; - struct vnode vp; struct inode tip; struct inodedep *idp; int error; @@ -2914,8 +2948,7 @@ handle_workitem_freefile(freefile) tip.i_devvp = freefile->fx_devvp; tip.i_dev = freefile->fx_devvp->v_rdev; tip.i_fs = fs; - tip.i_vnode = &vp; - vp.v_data = &tip; + tip.i_vnode = NULL; if ((error = ffs_freefile(&tip, freefile->fx_oldinum, freefile->fx_mode)) != 0) { @@ -4313,6 +4346,7 @@ flush_inodedep_deps(fs, ino) struct allocdirect *adp; int error, waitfor; struct buf *bp; + struct vnode *vp; /* * This work is done in two passes. The first pass grabs most @@ -4332,6 +4366,27 @@ flush_inodedep_deps(fs, ino) ACQUIRE_LOCK(&lk); if (inodedep_lookup(fs, ino, 0, &inodedep) == 0) return (0); + + /* + * When file data was in the buffer cache, + * softdep_sync_metadata() would start i/o on + * file data buffers itself. But now that + * we're using the page cache to hold file data, + * we need something else to trigger those flushes. + * let's just do it here. + */ + + vp = softdep_lookupvp(fs, ino); + if (vp) { + struct uvm_object *uobj = &vp->v_uvm.u_obj; + + simple_lock(&uobj->vmobjlock); + (uobj->pgops->pgo_flush)(uobj, 0, 0, + PGO_ALLPAGES|PGO_CLEANIT| + (waitfor == MNT_NOWAIT ? 0: PGO_SYNCIO)); + simple_unlock(&uobj->vmobjlock); + } + TAILQ_FOREACH(adp, &inodedep->id_inoupdt, ad_next) { if (adp->ad_state & DEPCOMPLETE) continue; @@ -4944,3 +4999,196 @@ softdep_error(func, error) /* XXX should do something better! */ printf("%s: got error %d while accessing filesystem\n", func, error); } + +/* + * Allocate a buffer on which to attach a dependency. + */ +STATIC struct buf * +softdep_setup_pagecache(ip, lbn, size) + struct inode *ip; + ufs_lbn_t lbn; + long size; +{ + struct vnode *vp = ITOV(ip); + struct buf *bp; + int s; + + /* + * Enter pagecache dependency buf in hash. + */ + + bp = softdep_lookup_pcbp(vp, lbn); + if (bp == NULL) { + s = splbio(); + bp = pool_get(&sdpcpool, PR_WAITOK); + splx(s); + + bp->b_vp = vp; + bp->b_lblkno = lbn; + bp->b_bcount = bp->b_resid = size; + LIST_INIT(&bp->b_dep); + LIST_INSERT_HEAD(&pcbphashhead[PCBPHASH(vp, lbn)], bp, b_hash); + LIST_INSERT_HEAD(&ip->i_pcbufhd, bp, b_vnbufs); + } else { + KASSERT(size >= bp->b_bcount); + bp->b_resid += size - bp->b_bcount; + bp->b_bcount = size; + } + return bp; +} + +/* + * softdep_collect_pagecache() and softdep_free_pagecache() + * are used to remove page cache dependency buffers when + * a file is being truncated to 0. 
+ */ + +STATIC void +softdep_collect_pagecache(ip) + struct inode *ip; +{ + struct buf *bp; + + LIST_FOREACH(bp, &ip->i_pcbufhd, b_vnbufs) { + LIST_REMOVE(bp, b_hash); + } +} + +STATIC void +softdep_free_pagecache(ip) + struct inode *ip; +{ + struct buf *bp, *nextbp; + + for (bp = LIST_FIRST(&ip->i_pcbufhd); bp != NULL; bp = nextbp) { + nextbp = LIST_NEXT(bp, b_vnbufs); + LIST_REMOVE(bp, b_vnbufs); + KASSERT(LIST_FIRST(&bp->b_dep) == NULL); + pool_put(&sdpcpool, bp); + } +} + +STATIC struct vnode * +softdep_lookupvp(fs, ino) + struct fs *fs; + ino_t ino; +{ + struct mount *mp; + extern struct vfsops ffs_vfsops; + + CIRCLEQ_FOREACH(mp, &mountlist, mnt_list) { + if (mp->mnt_op == &ffs_vfsops && + VFSTOUFS(mp)->um_fs == fs) { + break; + } + } + if (mp == NULL) { + return NULL; + } + return ufs_ihashlookup(VFSTOUFS(mp)->um_dev, ino); +} + +STATIC struct buf * +softdep_lookup_pcbp(vp, lbn) + struct vnode *vp; + ufs_lbn_t lbn; +{ + struct buf *bp; + + LIST_FOREACH(bp, &pcbphashhead[PCBPHASH(vp, lbn)], b_hash) { + if (bp->b_vp == vp && bp->b_lblkno == lbn) { + break; + } + } + return bp; +} + +/* + * Do softdep i/o completion processing for page cache writes. + */ + +void +softdep_pageiodone(bp) + struct buf *bp; +{ + int npages = bp->b_bufsize >> PAGE_SHIFT; + struct vnode *vp = bp->b_vp; + struct vm_page *pg; + struct buf *pcbp = NULL; + struct allocdirect *adp; + struct allocindir *aip; + struct worklist *wk; + ufs_lbn_t lbn; + voff_t off; + long iosize = bp->b_bcount; + int size, asize, bshift, bsize; + int i; + + KASSERT(!(bp->b_flags & B_READ)); + bshift = vp->v_mount->mnt_fs_bshift; + bsize = 1 << bshift; + asize = min(PAGE_SIZE, bsize); + ACQUIRE_LOCK(&lk); + for (i = 0; i < npages; i++) { + pg = uvm_pageratop((vaddr_t)bp->b_data + (i << PAGE_SHIFT)); + if (pg == NULL) { + continue; + } + + for (off = pg->offset; + off < pg->offset + PAGE_SIZE; + off += bsize) { + size = min(asize, iosize); + iosize -= size; + lbn = off >> bshift; + if (pcbp == NULL || pcbp->b_lblkno != lbn) { + pcbp = softdep_lookup_pcbp(vp, lbn); + } + if (pcbp == NULL) { + continue; + } + pcbp->b_resid -= size; + if (pcbp->b_resid < 0) { + panic("softdep_pageiodone: " + "resid < 0, vp %p lbn 0x%lx pcbp %p", + vp, lbn, pcbp); + } + if (pcbp->b_resid > 0) { + continue; + } + + /* + * We've completed all the i/o for this block. + * mark the dep complete. 
+ */ + + KASSERT(LIST_FIRST(&pcbp->b_dep) != NULL); + while ((wk = LIST_FIRST(&pcbp->b_dep))) { + WORKLIST_REMOVE(wk); + switch (wk->wk_type) { + case D_ALLOCDIRECT: + adp = WK_ALLOCDIRECT(wk); + adp->ad_state |= COMPLETE; + handle_allocdirect_partdone(adp); + break; + + case D_ALLOCINDIR: + aip = WK_ALLOCINDIR(wk); + aip->ai_state |= COMPLETE; + handle_allocindir_partdone(aip); + break; + + default: + panic("softdep_pageiodone: " + "bad type %d, pcbp %p wk %p", + wk->wk_type, pcbp, wk); + } + } + LIST_REMOVE(pcbp, b_hash); + LIST_REMOVE(pcbp, b_vnbufs); + pool_put(&sdpcpool, pcbp); + pcbp = NULL; + } + } + FREE_LOCK(&lk); +} diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index b1dee123893..19c77726fa8 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ffs_vfsops.c,v 1.45 2001/11/21 22:21:48 csapuntz Exp $ */ +/* $OpenBSD: ffs_vfsops.c,v 1.46 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: ffs_vfsops.c,v 1.19 1996/02/09 22:22:26 christos Exp $ */ /* @@ -737,11 +737,14 @@ ffs_mountfs(devvp, mp, p) else mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; + mp->mnt_fs_bshift = fs->fs_bshift; + mp->mnt_dev_bshift = DEV_BSHIFT; mp->mnt_flag |= MNT_LOCAL; ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_nindir = fs->fs_nindir; + ump->um_lognindir = ffs(fs->fs_nindir) - 1; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) @@ -1119,6 +1122,7 @@ retry: ip->i_fs = fs = ump->um_fs; ip->i_dev = dev; ip->i_number = ino; + LIST_INIT(&ip->i_pcbufhd); ip->i_vtbl = &ffs_vtbl; /* @@ -1199,6 +1203,7 @@ retry: ip->i_ffs_uid = ip->i_din.ffs_din.di_ouid; /* XXX */ ip->i_ffs_gid = ip->i_din.ffs_din.di_ogid; /* XXX */ } /* XXX */ + uvm_vnp_setsize(vp, ip->i_ffs_size); *vpp = vp; return (0); diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c index 26e9bbaf9da..8190ef82eb3 100644 --- a/sys/ufs/ffs/ffs_vnops.c +++ b/sys/ufs/ffs/ffs_vnops.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ffs_vnops.c,v 1.20 2001/11/06 19:53:21 miod Exp $ */ +/* $OpenBSD: ffs_vnops.c,v 1.21 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: ffs_vnops.c,v 1.7 1996/05/11 18:27:24 mycroft Exp $ */ /* @@ -107,8 +107,13 @@ struct vnodeopv_entry_desc ffs_vnodeop_entries[] = { { &vop_advlock_desc, ufs_advlock }, /* advlock */ { &vop_reallocblks_desc, ffs_reallocblks }, /* reallocblks */ { &vop_bwrite_desc, vop_generic_bwrite }, - { (struct vnodeop_desc*)NULL, (int(*) __P((void*)))NULL } + { &vop_ballocn_desc, ffs_ballocn }, + { &vop_getpages_desc, genfs_getpages }, + { &vop_putpages_desc, genfs_putpages }, + { &vop_size_desc, ffs_size }, + { NULL, NULL } }; + struct vnodeopv_desc ffs_vnodeop_opv_desc = { &ffs_vnodeop_p, ffs_vnodeop_entries }; @@ -229,6 +234,7 @@ ffs_fsync(v) struct vnode *vp = ap->a_vp; struct buf *bp, *nbp; int s, error, passes, skipmeta; + struct uvm_object *uobj; if (vp->v_type == VBLK && vp->v_specmountpoint != NULL && @@ -236,13 +242,22 @@ ffs_fsync(v) softdep_fsync_mountdev(vp); /* - * Flush all dirty buffers associated with a vnode. + * Flush all dirty data associated with a vnode. */ passes = NIADDR + 1; skipmeta = 0; if (ap->a_waitfor == MNT_WAIT) skipmeta = 1; s = splbio(); + + if (vp->v_type == VREG) { + uobj = &vp->v_uvm.u_obj; + simple_lock(&uobj->vmobjlock); + (uobj->pgops->pgo_flush)(uobj, 0, 0, PGO_ALLPAGES|PGO_CLEANIT| + ((ap->a_waitfor == MNT_WAIT) ? 
PGO_SYNCIO : 0)); + simple_unlock(&uobj->vmobjlock); + } + loop: for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = LIST_NEXT(bp, b_vnbufs)) @@ -281,8 +296,10 @@ loop: */ if (passes > 0 || ap->a_waitfor != MNT_WAIT) (void) bawrite(bp); - else if ((error = bwrite(bp)) != 0) + else if ((error = bwrite(bp)) != 0) { + printf("ffs_fsync: bwrite failed %d\n", error); return (error); + } s = splbio(); /* * Since we may have slept during the I/O, we need @@ -325,7 +342,11 @@ loop: } } splx(s); - return (UFS_UPDATE(VTOI(vp), ap->a_waitfor == MNT_WAIT)); + + error = (UFS_UPDATE(VTOI(vp), ap->a_waitfor == MNT_WAIT)); + if (error) + printf("ffs_fsync: UFS_UPDATE failed. %d\n", error); + return (error); } /* @@ -349,3 +370,31 @@ ffs_reclaim(v) vp->v_data = NULL; return (0); } + +/* + * Return the last logical file offset that should be written for this file + * if we're doing a write that ends at "size". + */ +int +ffs_size(v) + void *v; +{ + struct vop_size_args /* { + struct vnode *a_vp; + off_t a_size; + off_t *a_eobp; + } */ *ap = v; + struct inode *ip = VTOI(ap->a_vp); + struct fs *fs = ip->i_fs; + ufs_lbn_t olbn, nlbn; + + olbn = lblkno(fs, ip->i_ffs_size); + nlbn = lblkno(fs, ap->a_size); + + if (nlbn < NDADDR && olbn <= nlbn) { + *ap->a_eobp = fragroundup(fs, ap->a_size); + } else { + *ap->a_eobp = blkroundup(fs, ap->a_size); + } + return 0; +} diff --git a/sys/ufs/ufs/inode.h b/sys/ufs/ufs/inode.h index 5665b276a0f..98c73de5579 100644 --- a/sys/ufs/ufs/inode.h +++ b/sys/ufs/ufs/inode.h @@ -1,4 +1,4 @@ -/* $OpenBSD: inode.h,v 1.16 2001/07/04 06:10:50 angelos Exp $ */ +/* $OpenBSD: inode.h,v 1.17 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: inode.h,v 1.8 1995/06/15 23:22:50 cgd Exp $ */ /* @@ -84,6 +84,7 @@ struct inode { #define i_e2fs inode_u.e2fs struct cluster_info i_ci; + LIST_HEAD(,buf) i_pcbufhd; struct dquot *i_dquot[MAXQUOTAS]; /* Dquot structures. */ u_quad_t i_modrev; /* Revision level for NFS lease. */ struct lockf *i_lockf;/* Head of byte-level lock list. */ diff --git a/sys/ufs/ufs/ufs_bmap.c b/sys/ufs/ufs/ufs_bmap.c index add641e15ce..fdf5c1be055 100644 --- a/sys/ufs/ufs/ufs_bmap.c +++ b/sys/ufs/ufs/ufs_bmap.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ufs_bmap.c,v 1.10 2001/11/21 22:24:24 csapuntz Exp $ */ +/* $OpenBSD: ufs_bmap.c,v 1.11 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: ufs_bmap.c,v 1.3 1996/02/09 22:36:00 christos Exp $ */ /* @@ -233,6 +233,7 @@ ufs_getlbns(vp, bn, ap, nump) long metalbn, realbn; struct ufsmount *ump; int64_t blockcnt; + int lbc; int i, numlevels, off; ump = VFSTOUFS(vp->v_mount); @@ -260,10 +261,14 @@ ufs_getlbns(vp, bn, ap, nump) * at the given level of indirection, and NIADDR - i is the number * of levels of indirection needed to locate the requested block. 
*/ - for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) { + bn -= NDADDR; + for (lbc = 0, i = NIADDR;; i--, bn -= blockcnt) { if (i == 0) return (EFBIG); - blockcnt *= MNINDIR(ump); + + lbc += ump->um_lognindir; + blockcnt = (int64_t)1 << lbc; + if (bn < blockcnt) break; } @@ -289,8 +294,9 @@ ufs_getlbns(vp, bn, ap, nump) if (metalbn == realbn) break; - blockcnt /= MNINDIR(ump); - off = (bn / blockcnt) % MNINDIR(ump); + lbc -= ump->um_lognindir; + blockcnt = (int64_t)1 << lbc; + off = (bn >> lbc) & (MNINDIR(ump) - 1); ++numlevels; ap->in_lbn = metalbn; diff --git a/sys/ufs/ufs/ufs_extern.h b/sys/ufs/ufs/ufs_extern.h index 50175a0ec86..fc39e16b45e 100644 --- a/sys/ufs/ufs/ufs_extern.h +++ b/sys/ufs/ufs/ufs_extern.h @@ -1,4 +1,4 @@ -/* $OpenBSD: ufs_extern.h,v 1.12 2001/11/21 21:23:56 csapuntz Exp $ */ +/* $OpenBSD: ufs_extern.h,v 1.13 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: ufs_extern.h,v 1.5 1996/02/09 22:36:03 christos Exp $ */ /*- @@ -121,6 +121,7 @@ void ufs_ihashrem __P((struct inode *)); /* ufs_inode.c */ int ufs_init __P((struct vfsconf *)); int ufs_reclaim __P((struct vnode *, struct proc *)); +int ufs_balloc_range __P((struct vnode *, off_t, off_t, struct ucred *, int)); /* ufs_lookup.c */ void ufs_dirbad __P((struct inode *, doff_t, char *)); diff --git a/sys/ufs/ufs/ufs_inode.c b/sys/ufs/ufs/ufs_inode.c index 8a3935632fb..3865342fde0 100644 --- a/sys/ufs/ufs/ufs_inode.c +++ b/sys/ufs/ufs/ufs_inode.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ufs_inode.c,v 1.10 2001/11/21 21:23:56 csapuntz Exp $ */ +/* $OpenBSD: ufs_inode.c,v 1.11 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: ufs_inode.c,v 1.7 1996/05/11 18:27:52 mycroft Exp $ */ /* @@ -151,3 +151,150 @@ ufs_reclaim(vp, p) ufs_quota_delete(ip); return (0); } + +/* + * allocate a range of blocks in a file. + * after this function returns, any page entirely contained within the range + * will map to invalid data and thus must be overwritten before it is made + * accessible to others. + */ + +int +ufs_balloc_range(vp, off, len, cred, flags) + struct vnode *vp; + off_t off, len; + struct ucred *cred; + int flags; +{ + off_t oldeof, neweof, oldeob, neweob, oldpagestart, pagestart; + struct uvm_object *uobj; + int i, delta, error, npages1, npages2; + int bshift = vp->v_mount->mnt_fs_bshift; + int bsize = 1 << bshift; + int ppb = MAX(bsize >> PAGE_SHIFT, 1); + struct vm_page *pgs1[ppb], *pgs2[ppb]; + UVMHIST_FUNC("ufs_balloc_range"); UVMHIST_CALLED(ubchist); + UVMHIST_LOG(ubchist, "vp %p off 0x%x len 0x%x u_size 0x%x", + vp, off, len, vp->v_uvm.u_size); + + oldeof = vp->v_uvm.u_size; + error = VOP_SIZE(vp, oldeof, &oldeob); + if (error) { + return error; + } + + neweof = MAX(vp->v_uvm.u_size, off + len); + error = VOP_SIZE(vp, neweof, &neweob); + if (error) { + return error; + } + + error = 0; + uobj = &vp->v_uvm.u_obj; + pgs1[0] = pgs2[0] = NULL; + + /* + * if the last block in the file is not a full block (ie. it is a + * fragment), and this allocation is causing the fragment to change + * size (either to expand the fragment or promote it to a full block), + * cache the old last block (at its new size). 
+ */ + + oldpagestart = trunc_page(oldeof) & ~(bsize - 1); + if ((oldeob & (bsize - 1)) != 0 && oldeob != neweob) { + npages1 = MIN(ppb, (round_page(neweob) - oldpagestart) >> + PAGE_SHIFT); + memset(pgs1, 0, npages1 * sizeof(struct vm_page *)); + simple_lock(&uobj->vmobjlock); + error = VOP_GETPAGES(vp, oldpagestart, pgs1, &npages1, + 0, VM_PROT_READ, 0, PGO_SYNCIO|PGO_PASTEOF); + if (error) { + goto out; + } + simple_lock(&uobj->vmobjlock); + uvm_lock_pageq(); + for (i = 0; i < npages1; i++) { + UVMHIST_LOG(ubchist, "got pgs1[%d] %p", i, pgs1[i],0,0); + KASSERT((pgs1[i]->flags & PG_RELEASED) == 0); + pgs1[i]->flags &= ~PG_CLEAN; + uvm_pageactivate(pgs1[i]); + } + uvm_unlock_pageq(); + simple_unlock(&uobj->vmobjlock); + } + + /* + * cache the new range as well. this will create zeroed pages + * where the new block will be and keep them locked until the + * new block is allocated, so there will be no window where + * the old contents of the new block is visible to racing threads. + */ + + pagestart = trunc_page(off) & ~(bsize - 1); + if (pagestart != oldpagestart || pgs1[0] == NULL) { + npages2 = MIN(ppb, (round_page(neweob) - pagestart) >> + PAGE_SHIFT); + memset(pgs2, 0, npages2 * sizeof(struct vm_page *)); + simple_lock(&uobj->vmobjlock); + error = VOP_GETPAGES(vp, pagestart, pgs2, &npages2, 0, + VM_PROT_READ, 0, PGO_SYNCIO|PGO_PASTEOF); + if (error) { + goto out; + } + simple_lock(&uobj->vmobjlock); + uvm_lock_pageq(); + for (i = 0; i < npages2; i++) { + UVMHIST_LOG(ubchist, "got pgs2[%d] %p", i, pgs2[i],0,0); + KASSERT((pgs2[i]->flags & PG_RELEASED) == 0); + pgs2[i]->flags &= ~PG_CLEAN; + uvm_pageactivate(pgs2[i]); + } + uvm_unlock_pageq(); + simple_unlock(&uobj->vmobjlock); + } + + /* + * adjust off to be block-aligned. + */ + + delta = off & (bsize - 1); + off -= delta; + len += delta; + + /* + * now allocate the range. + */ + + lockmgr(&vp->v_glock, LK_EXCLUSIVE, NULL, curproc); + error = VOP_BALLOCN(vp, off, len, cred, flags); + lockmgr(&vp->v_glock, LK_RELEASE, NULL, curproc); + + /* + * unbusy any pages we are holding. + * if we got an error, free any pages we created past the old eob. + */ + +out: + simple_lock(&uobj->vmobjlock); + if (error) { + (void) (uobj->pgops->pgo_flush)(uobj, round_page(oldeob), 0, + PGO_FREE); + } + if (pgs1[0] != NULL) { + uvm_page_unbusy(pgs1, npages1); + + /* + * The data in the frag might be moving to a new disk location. + * We need to flush pages to the new disk locations. + */ + + (uobj->pgops->pgo_flush)(uobj, oldeof & ~(bsize - 1), + MIN((oldeof + bsize) & ~(bsize - 1), neweof), + PGO_CLEANIT | ((flags & B_SYNC) ? 
PGO_SYNCIO : 0)); + } + if (pgs2[0] != NULL) { + uvm_page_unbusy(pgs2, npages2); + } + simple_unlock(&uobj->vmobjlock); + return error; +} diff --git a/sys/ufs/ufs/ufs_readwrite.c b/sys/ufs/ufs/ufs_readwrite.c index bbf1391dfe5..e0777e4b55f 100644 --- a/sys/ufs/ufs/ufs_readwrite.c +++ b/sys/ufs/ufs/ufs_readwrite.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ufs_readwrite.c,v 1.19 2001/06/27 04:58:49 art Exp $ */ +/* $OpenBSD: ufs_readwrite.c,v 1.20 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: ufs_readwrite.c,v 1.9 1996/05/11 18:27:57 mycroft Exp $ */ /*- @@ -76,21 +76,22 @@ READ(v) int a_ioflag; struct ucred *a_cred; } */ *ap = v; - register struct vnode *vp; - register struct inode *ip; - register struct uio *uio; - register FS *fs; + struct vnode *vp; + struct inode *ip; + struct uio *uio; + FS *fs; + void *win; + vsize_t bytelen; struct buf *bp; daddr_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; int error; - u_short mode; vp = ap->a_vp; ip = VTOI(vp); - mode = ip->i_ffs_mode; uio = ap->a_uio; + error = 0; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ) @@ -110,6 +111,24 @@ READ(v) if (uio->uio_resid == 0) return (0); + if (uio->uio_offset >= ip->i_ffs_size) + goto out; + + if (vp->v_type == VREG) { + while (uio->uio_resid > 0) { + bytelen = min(ip->i_ffs_size - uio->uio_offset, + uio->uio_resid); + if (bytelen == 0) + break; + win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, + &bytelen, UBC_READ); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + if (error) + break; + } + goto out; + } for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_ffs_size - uio->uio_offset) <= 0) @@ -131,9 +150,6 @@ READ(v) #else if (lblktosize(fs, nextlbn) >= ip->i_ffs_size) error = bread(vp, lbn, size, NOCRED, &bp); - else if (doclusterread) - error = cluster_read(vp, &ip->i_ci, - ip->i_ffs_size, lbn, size, NOCRED, &bp); else if (lbn - 1 == ip->i_ci.ci_lastr) { int nextsize = BLKSIZE(fs, ip, nextlbn); error = breadn(vp, lbn, @@ -158,7 +174,7 @@ READ(v) break; xfersize = size; } - error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); if (error) break; @@ -166,6 +182,7 @@ READ(v) } if (bp != NULL) brelse(bp); +out: ip->i_flag |= IN_ACCESS; return (error); } @@ -183,15 +200,19 @@ WRITE(v) int a_ioflag; struct ucred *a_cred; } */ *ap = v; - register struct vnode *vp; - register struct uio *uio; - register struct inode *ip; - register FS *fs; + struct vnode *vp; + struct uio *uio; + struct inode *ip; + FS *fs; struct buf *bp; struct proc *p; daddr_t lbn; off_t osize; int blkoffset, error, extended, flags, ioflag, resid, size, xfersize; + void *win; + vsize_t bytelen; + off_t oldoff; + boolean_t rv; extended = 0; ioflag = ap->a_ioflag; @@ -239,9 +260,77 @@ WRITE(v) resid = uio->uio_resid; osize = ip->i_ffs_size; - flags = ioflag & IO_SYNC ? B_SYNC : 0; + error = 0; + + if (vp->v_type != VREG) + goto bcache; + + while (uio->uio_resid > 0) { + oldoff = uio->uio_offset; + blkoffset = blkoff(fs, uio->uio_offset); + bytelen = min(fs->fs_bsize - blkoffset, uio->uio_resid); + + /* + * XXXUBC if file is mapped and this is the last block, + * process one page at a time. + */ + + error = ufs_balloc_range(vp, uio->uio_offset, bytelen, + ap->a_cred, ioflag & IO_SYNC ? 
B_SYNC : 0); + if (error) { + return error; + } - for (error = 0; uio->uio_resid > 0;) { + win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, &bytelen, + UBC_WRITE); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + + /* + * flush what we just wrote if necessary. + * XXXUBC simplistic async flushing. + */ + + if (ioflag & IO_SYNC) { + simple_lock(&vp->v_uvm.u_obj.vmobjlock); +#if 1 + /* + * XXX + * flush whole blocks in case there are deps. + * otherwise we can dirty and flush part of + * a block multiple times and the softdep code + * will get confused. fixing this the right way + * is complicated so we'll work around it for now. + */ + + rv = vp->v_uvm.u_obj.pgops->pgo_flush( + &vp->v_uvm.u_obj, + oldoff & ~(fs->fs_bsize - 1), + (oldoff + bytelen + fs->fs_bsize - 1) & + ~(fs->fs_bsize - 1), + PGO_CLEANIT|PGO_SYNCIO); +#else + rv = vp->v_uvm.u_obj.pgops->pgo_flush( + &vp->v_uvm.u_obj, oldoff, oldoff + bytelen, + PGO_CLEANIT|PGO_SYNCIO); +#endif + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); + } else if (oldoff >> 16 != uio->uio_offset >> 16) { + simple_lock(&vp->v_uvm.u_obj.vmobjlock); + rv = vp->v_uvm.u_obj.pgops->pgo_flush( + &vp->v_uvm.u_obj, (oldoff >> 16) << 16, + (uio->uio_offset >> 16) << 16, PGO_CLEANIT); + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); + } + if (error) { + break; + } + } + goto out; + +bcache: + flags = ioflag & IO_SYNC ? B_SYNC : 0; + while (uio->uio_resid > 0) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; @@ -260,14 +349,12 @@ WRITE(v) uvm_vnp_setsize(vp, ip->i_ffs_size); extended = 1; } - (void)uvm_vnp_uncache(vp); size = BLKSIZE(fs, ip, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; - error = - uiomove((char *)bp->b_data + blkoffset, xfersize, uio); + error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio); if (error != 0) bzero((char *)bp->b_data + blkoffset, xfersize); @@ -287,13 +374,14 @@ WRITE(v) #endif if (error || xfersize == 0) break; - ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. 
*/ +out: + ip->i_flag |= IN_CHANGE | IN_UPDATE; if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0) ip->i_ffs_mode &= ~(ISUID | ISGID); if (resid > uio->uio_resid) diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index 4caf0ef78c7..e926ee7aff6 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ufs_vnops.c,v 1.39 2001/11/21 21:23:56 csapuntz Exp $ */ +/* $OpenBSD: ufs_vnops.c,v 1.40 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: ufs_vnops.c,v 1.18 1996/05/11 18:28:04 mycroft Exp $ */ /* @@ -469,8 +469,6 @@ ufs_chmod(vp, mode, cred, p) ip->i_ffs_mode &= ~ALLPERMS; ip->i_ffs_mode |= (mode & ALLPERMS); ip->i_flag |= IN_CHANGE; - if ((vp->v_flag & VTEXT) && (ip->i_ffs_mode & S_ISTXT) == 0) - (void) uvm_vnp_uncache(vp); return (0); } diff --git a/sys/ufs/ufs/ufsmount.h b/sys/ufs/ufs/ufsmount.h index e9dc71f9855..981eb21474b 100644 --- a/sys/ufs/ufs/ufsmount.h +++ b/sys/ufs/ufs/ufsmount.h @@ -1,4 +1,4 @@ -/* $OpenBSD: ufsmount.h,v 1.5 1999/06/01 01:48:52 millert Exp $ */ +/* $OpenBSD: ufsmount.h,v 1.6 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: ufsmount.h,v 1.4 1994/12/21 20:00:23 mycroft Exp $ */ /* @@ -64,6 +64,7 @@ struct ufsmount { struct vnode *um_quotas[MAXQUOTAS]; /* pointer to quota files */ struct ucred *um_cred[MAXQUOTAS]; /* quota file access cred */ u_long um_nindir; /* indirect ptrs per block */ + u_long um_lognindir; /* log2 of um_nindir */ u_long um_bptrtodb; /* indir ptr to disk block */ u_long um_seqinc; /* inc between seq blocks */ time_t um_btime[MAXQUOTAS]; /* block quota time limit */ diff --git a/sys/uvm/uvm_anon.c b/sys/uvm/uvm_anon.c index 347867e47b8..8478141a72c 100644 --- a/sys/uvm/uvm_anon.c +++ b/sys/uvm/uvm_anon.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_anon.c,v 1.15 2001/11/11 01:16:56 art Exp $ */ +/* $OpenBSD: uvm_anon.c,v 1.16 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: uvm_anon.c,v 1.15 2001/02/18 21:19:08 chs Exp $ */ /* @@ -518,9 +518,6 @@ anon_pagein(anon) */ pmap_clear_reference(pg); -#ifndef UBC - pmap_page_protect(pg, VM_PROT_NONE); -#endif uvm_lock_pageq(); uvm_pagedeactivate(pg); uvm_unlock_pageq(); diff --git a/sys/uvm/uvm_aobj.c b/sys/uvm/uvm_aobj.c index 85ce0a495f6..0ebf53c3502 100644 --- a/sys/uvm/uvm_aobj.c +++ b/sys/uvm/uvm_aobj.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_aobj.c,v 1.20 2001/11/11 01:16:56 art Exp $ */ +/* $OpenBSD: uvm_aobj.c,v 1.21 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: uvm_aobj.c,v 1.39 2001/02/18 21:19:08 chs Exp $ */ /* @@ -878,15 +878,8 @@ uao_flush(uobj, start, stop, flags) pp->wire_count != 0) continue; -#ifdef UBC /* ...and deactivate the page. */ pmap_clear_reference(pp); -#else - /* zap all mappings for the page. */ - pmap_page_protect(pp, VM_PROT_NONE); - - /* ...and deactivate the page. */ -#endif uvm_pagedeactivate(pp); continue; @@ -1523,9 +1516,6 @@ uao_pagein_page(aobj, pageidx) * deactivate the page (to put it on a page queue). */ pmap_clear_reference(pg); -#ifndef UBC - pmap_page_protect(pg, VM_PROT_NONE); -#endif uvm_lock_pageq(); uvm_pagedeactivate(pg); uvm_unlock_pageq(); diff --git a/sys/uvm/uvm_bio.c b/sys/uvm/uvm_bio.c new file mode 100644 index 00000000000..fccf51b8ece --- /dev/null +++ b/sys/uvm/uvm_bio.c @@ -0,0 +1,547 @@ +/* $NetBSD: uvm_bio.c,v 1.7 2001/02/02 01:55:52 enami Exp $ */ + +/* + * Copyright (c) 1998 Chuck Silvers. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +/* + * uvm_bio.c: buffered i/o vnode mapping cache + */ + + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/vnode.h> + +#include <uvm/uvm.h> +#include <uvm/uvm_page.h> + +/* + * global data structures + */ + +/* + * local functions + */ + +static int ubc_fault __P((struct uvm_faultinfo *, vaddr_t, + vm_page_t *, int, int, vm_fault_t, vm_prot_t, + int)); +static struct ubc_map *ubc_find_mapping __P((struct uvm_object *, voff_t)); + +/* + * local data structues + */ + +#define UBC_HASH(uobj, offset) (((((u_long)(uobj)) >> 8) + \ + (((u_long)(offset)) >> PAGE_SHIFT)) & \ + ubc_object.hashmask) + +#define UBC_QUEUE(offset) (&ubc_object.inactive[((offset) / ubc_winsize) & \ + (UBC_NQUEUES - 1)]) + +struct ubc_map +{ + struct uvm_object * uobj; /* mapped object */ + voff_t offset; /* offset into uobj */ + int refcount; /* refcount on mapping */ + voff_t writeoff; /* overwrite offset */ + vsize_t writelen; /* overwrite len */ + + LIST_ENTRY(ubc_map) hash; /* hash table */ + TAILQ_ENTRY(ubc_map) inactive; /* inactive queue */ +}; + +static struct ubc_object +{ + struct uvm_object uobj; /* glue for uvm_map() */ + char *kva; /* where ubc_object is mapped */ + struct ubc_map *umap; /* array of ubc_map's */ + + LIST_HEAD(, ubc_map) *hash; /* hashtable for cached ubc_map's */ + u_long hashmask; /* mask for hashtable */ + + TAILQ_HEAD(ubc_inactive_head, ubc_map) *inactive; + /* inactive queues for ubc_map's */ + +} ubc_object; + +struct uvm_pagerops ubc_pager = +{ + NULL, /* init */ + NULL, /* reference */ + NULL, /* detach */ + ubc_fault, /* fault */ + /* ... rest are NULL */ +}; + +int ubc_nwins = UBC_NWINS; +int ubc_winsize = UBC_WINSIZE; +#ifdef PMAP_PREFER +int ubc_nqueues; +boolean_t ubc_release_unmap = FALSE; +#define UBC_NQUEUES ubc_nqueues +#define UBC_RELEASE_UNMAP ubc_release_unmap +#else +#define UBC_NQUEUES 1 +#define UBC_RELEASE_UNMAP FALSE +#endif + +/* + * ubc_init + * + * init pager private data structures. + */ + +void +ubc_init(void) +{ + struct ubc_map *umap; + vaddr_t va; + int i; + + /* + * init ubc_object. + * alloc and init ubc_map's. + * init inactive queues. + * alloc and init hashtable. + * map in ubc_object. 
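For clarity on the UBC_HASH and UBC_QUEUE macros defined a little earlier: a window is looked up by hashing the object pointer and page-aligned offset, and each window offset also maps onto one of the inactive queues. A standalone model of those two index computations, with example constants that are assumptions rather than the kernel's values:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12	/* assumed 4KB pages */
#define UBC_WINSIZE	8192	/* example window size */
#define UBC_NQUEUES	4	/* power of two, as required */
#define HASHMASK	255	/* e.g. a 256-bucket hashinit() table */

/* which hash chain a (uobj, offset) window would live on */
static unsigned long
win_hash(const void *uobj, uint64_t offset)
{
	return ((((uintptr_t)uobj) >> 8) + (offset >> PAGE_SHIFT)) & HASHMASK;
}

/* which inactive queue a window offset maps to */
static unsigned int
win_queue(uint64_t offset)
{
	return (offset / UBC_WINSIZE) & (UBC_NQUEUES - 1);
}

int
main(void)
{
	static int obj;		/* stands in for a struct uvm_object */

	printf("hash %lu queue %u\n",
	    win_hash(&obj, 0x12000), win_queue(0x12000));
	return 0;
}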
+ */ + + simple_lock_init(&ubc_object.uobj.vmobjlock); + ubc_object.uobj.pgops = &ubc_pager; + TAILQ_INIT(&ubc_object.uobj.memq); + ubc_object.uobj.uo_npages = 0; + ubc_object.uobj.uo_refs = UVM_OBJ_KERN; + + ubc_object.umap = malloc(ubc_nwins * sizeof(struct ubc_map), + M_TEMP, M_NOWAIT); + if (ubc_object.umap == NULL) + panic("ubc_init: failed to allocate ubc_map"); + bzero(ubc_object.umap, ubc_nwins * sizeof(struct ubc_map)); + + va = (vaddr_t)1L; +#ifdef PMAP_PREFER + PMAP_PREFER(0, &va); + if (va < ubc_winsize) { + va = ubc_winsize; + } + ubc_nqueues = va / ubc_winsize; + if (ubc_nqueues != 1) { + ubc_release_unmap = TRUE; + } +#endif + ubc_object.inactive = malloc(UBC_NQUEUES * + sizeof(struct ubc_inactive_head), + M_TEMP, M_NOWAIT); + if (ubc_object.inactive == NULL) + panic("ubc_init: failed to allocate inactive queue heads"); + for (i = 0; i < UBC_NQUEUES; i++) { + TAILQ_INIT(&ubc_object.inactive[i]); + } + for (i = 0; i < ubc_nwins; i++) { + umap = &ubc_object.umap[i]; + TAILQ_INSERT_TAIL(&ubc_object.inactive[i & (UBC_NQUEUES - 1)], + umap, inactive); + } + + ubc_object.hash = hashinit(ubc_nwins, M_TEMP, M_NOWAIT, + &ubc_object.hashmask); + for (i = 0; i <= ubc_object.hashmask; i++) { + LIST_INIT(&ubc_object.hash[i]); + } + + if (uvm_map(kernel_map, (vaddr_t *)&ubc_object.kva, + ubc_nwins * ubc_winsize, &ubc_object.uobj, 0, (vsize_t)va, + UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE, + UVM_ADV_RANDOM, UVM_FLAG_NOMERGE)) + != KERN_SUCCESS) { + panic("ubc_init: failed to map ubc_object\n"); + } + UVMHIST_INIT(ubchist, 300); +} + + +/* + * ubc_fault: fault routine for ubc mapping + */ +static int +ubc_fault(ufi, ign1, ign2, ign3, ign4, fault_type, access_type, flags) + struct uvm_faultinfo *ufi; + vaddr_t ign1; + vm_page_t *ign2; + int ign3, ign4; + vm_fault_t fault_type; + vm_prot_t access_type; + int flags; +{ + struct uvm_object *uobj; + struct vnode *vp; + struct ubc_map *umap; + vaddr_t va, eva, ubc_offset, slot_offset; + int i, error, rv, npages; + struct vm_page *pgs[ubc_winsize >> PAGE_SHIFT], *pg; + UVMHIST_FUNC("ubc_fault"); UVMHIST_CALLED(ubchist); + + /* + * no need to try with PGO_LOCKED... + * we don't need to have the map locked since we know that + * no one will mess with it until our reference is released. + */ + if (flags & PGO_LOCKED) { +#if 0 + return VM_PAGER_UNLOCK; +#else + uvmfault_unlockall(ufi, NULL, &ubc_object.uobj, NULL); + flags &= ~PGO_LOCKED; +#endif + } + + va = ufi->orig_rvaddr; + ubc_offset = va - (vaddr_t)ubc_object.kva; + + UVMHIST_LOG(ubchist, "va 0x%lx ubc_offset 0x%lx at %d", + va, ubc_offset, access_type,0); + + umap = &ubc_object.umap[ubc_offset / ubc_winsize]; + KASSERT(umap->refcount != 0); + slot_offset = trunc_page(ubc_offset & (ubc_winsize - 1)); + + /* no umap locking needed since we have a ref on the umap */ + uobj = umap->uobj; + vp = (struct vnode *)uobj; + KASSERT(uobj != NULL); + + npages = (ubc_winsize - slot_offset) >> PAGE_SHIFT; + + /* + * XXXUBC + * if npages is more than 1 we have to be sure that + * we set PGO_OVERWRITE correctly. 
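To make the window arithmetic at the top of ubc_fault easier to follow: the faulting kernel VA is converted back into a window index, a page-aligned offset within that window, and the number of pages left before the window ends. A plain-C illustration with assumed constants and a made-up fault address:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096u
#define PAGE_SHIFT	12
#define UBC_WINSIZE	8192u
#define trunc_page(x)	((x) & ~((uintptr_t)PAGE_SIZE - 1))

int
main(void)
{
	uintptr_t kva_base = 0x100000;	/* base of the mapped window array */
	uintptr_t fault_va = 0x103500;	/* hypothetical faulting address */

	uintptr_t ubc_offset  = fault_va - kva_base;
	unsigned  window      = ubc_offset / UBC_WINSIZE;
	uintptr_t slot_offset = trunc_page(ubc_offset & (UBC_WINSIZE - 1));
	unsigned  npages      = (UBC_WINSIZE - slot_offset) >> PAGE_SHIFT;

	/* prints: window 1 slot 0x1000 npages 1 */
	printf("window %u slot 0x%lx npages %u\n",
	    window, (unsigned long)slot_offset, npages);
	return 0;
}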
+ */ + if (access_type == VM_PROT_WRITE) { + npages = 1; + } + +again: + memset(pgs, 0, sizeof (pgs)); + simple_lock(&uobj->vmobjlock); + + UVMHIST_LOG(ubchist, "slot_offset 0x%x writeoff 0x%x writelen 0x%x " + "u_size 0x%x", slot_offset, umap->writeoff, umap->writelen, + vp->v_uvm.u_size); + + if (access_type & VM_PROT_WRITE && + slot_offset >= umap->writeoff && + (slot_offset + PAGE_SIZE <= umap->writeoff + umap->writelen || + slot_offset + PAGE_SIZE >= vp->v_uvm.u_size - umap->offset)) { + UVMHIST_LOG(ubchist, "setting PGO_OVERWRITE", 0,0,0,0); + flags |= PGO_OVERWRITE; + } + else { UVMHIST_LOG(ubchist, "NOT setting PGO_OVERWRITE", 0,0,0,0); } + /* XXX be sure to zero any part of the page past EOF */ + + /* + * XXX + * ideally we'd like to pre-fault all of the pages we're overwriting. + * so for PGO_OVERWRITE, we should call VOP_GETPAGES() with all of the + * pages in [writeoff, writeoff+writesize] instead of just the one. + */ + + UVMHIST_LOG(ubchist, "getpages vp %p offset 0x%x npages %d", + uobj, umap->offset + slot_offset, npages, 0); + + error = VOP_GETPAGES(vp, umap->offset + slot_offset, pgs, &npages, 0, + access_type, 0, flags); + UVMHIST_LOG(ubchist, "getpages error %d npages %d", error, npages,0,0); + + if (error == EAGAIN) { + tsleep(&lbolt, PVM, "ubc_fault", 0); + goto again; + } + if (error) { + return VM_PAGER_ERROR; + } + if (npages == 0) { + return VM_PAGER_OK; + } + + va = ufi->orig_rvaddr; + eva = ufi->orig_rvaddr + (npages << PAGE_SHIFT); + + UVMHIST_LOG(ubchist, "va 0x%lx eva 0x%lx", va, eva, 0,0); + simple_lock(&uobj->vmobjlock); + for (i = 0; va < eva; i++, va += PAGE_SIZE) { + UVMHIST_LOG(ubchist, "pgs[%d] = %p", i, pgs[i],0,0); + pg = pgs[i]; + + if (pg == NULL || pg == PGO_DONTCARE) { + continue; + } + if (pg->flags & PG_WANTED) { + wakeup(pg); + } + KASSERT((pg->flags & PG_FAKE) == 0); + if (pg->flags & PG_RELEASED) { + rv = uobj->pgops->pgo_releasepg(pg, NULL); + KASSERT(rv); + continue; + } + KASSERT(access_type == VM_PROT_READ || + (pg->flags & PG_RDONLY) == 0); + + uvm_lock_pageq(); + uvm_pageactivate(pg); + uvm_unlock_pageq(); + + pmap_enter(ufi->orig_map->pmap, va, VM_PAGE_TO_PHYS(pg), + VM_PROT_READ | VM_PROT_WRITE, access_type); + + pg->flags &= ~(PG_BUSY); + UVM_PAGE_OWN(pg, NULL); + } + simple_unlock(&uobj->vmobjlock); + return VM_PAGER_OK; +} + +/* + * local functions + */ + +static struct ubc_map * +ubc_find_mapping(uobj, offset) + struct uvm_object *uobj; + voff_t offset; +{ + struct ubc_map *umap; + + LIST_FOREACH(umap, &ubc_object.hash[UBC_HASH(uobj, offset)], hash) { + if (umap->uobj == uobj && umap->offset == offset) { + return umap; + } + } + return NULL; +} + + +/* + * ubc interface functions + */ + +/* + * ubc_alloc: allocate a buffer mapping + */ +void * +ubc_alloc(uobj, offset, lenp, flags) + struct uvm_object *uobj; + voff_t offset; + vsize_t *lenp; + int flags; +{ + int s; + vaddr_t slot_offset, va; + struct ubc_map *umap; + voff_t umap_offset; + UVMHIST_FUNC("ubc_alloc"); UVMHIST_CALLED(ubchist); + + UVMHIST_LOG(ubchist, "uobj %p offset 0x%lx len 0x%lx filesize 0x%x", + uobj, offset, *lenp, ((struct uvm_vnode *)uobj)->u_size); + + umap_offset = (offset & ~((voff_t)ubc_winsize - 1)); + slot_offset = (vaddr_t)(offset & ((voff_t)ubc_winsize - 1)); + *lenp = min(*lenp, ubc_winsize - slot_offset); + + /* + * the vnode is always locked here, so we don't need to add a ref. 
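The length clamp just above (*lenp is trimmed to the end of the window) is what lets the new READ/WRITE loops earlier in this diff walk a transfer one window at a time. A self-contained, caller's-eye toy of that pattern, with all names invented, a flat array standing in for the file's data, and memcpy standing in for uiomove:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define UBC_WINSIZE	8192u

static char file_image[64 * 1024];	/* stands in for the vnode's data */

/* toy ubc_alloc: return a window into the backing store and clamp *lenp
 * so the caller never crosses a window boundary in one step */
static void *
toy_ubc_alloc(uint64_t offset, size_t *lenp)
{
	size_t slot = offset & (UBC_WINSIZE - 1);

	if (*lenp > UBC_WINSIZE - slot)
		*lenp = UBC_WINSIZE - slot;
	return file_image + offset;
}

static void
toy_ubc_release(void *win)
{
	(void)win;			/* nothing to undo in the toy */
}

int
main(void)
{
	char buf[20000];
	uint64_t off = 100;
	size_t resid = sizeof(buf), done = 0;

	while (resid > 0) {
		size_t bytelen = resid;
		void *win = toy_ubc_alloc(off, &bytelen);

		memcpy(buf + done, win, bytelen);	/* uiomove stand-in */
		toy_ubc_release(win);
		off += bytelen;
		done += bytelen;
		resid -= bytelen;
	}
	printf("copied %zu bytes in window-sized chunks\n", done);
	return 0;
}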
+ */ + + s = splbio(); + +again: + simple_lock(&ubc_object.uobj.vmobjlock); + umap = ubc_find_mapping(uobj, umap_offset); + if (umap == NULL) { + umap = TAILQ_FIRST(UBC_QUEUE(offset)); + if (umap == NULL) { + simple_unlock(&ubc_object.uobj.vmobjlock); + tsleep(&lbolt, PVM, "ubc_alloc", 0); + goto again; + } + + /* + * remove from old hash (if any), + * add to new hash. + */ + + if (umap->uobj != NULL) { + LIST_REMOVE(umap, hash); + } + + umap->uobj = uobj; + umap->offset = umap_offset; + + LIST_INSERT_HEAD(&ubc_object.hash[UBC_HASH(uobj, umap_offset)], + umap, hash); + + va = (vaddr_t)(ubc_object.kva + + (umap - ubc_object.umap) * ubc_winsize); + pmap_remove(pmap_kernel(), va, va + ubc_winsize); + } + + if (umap->refcount == 0) { + TAILQ_REMOVE(UBC_QUEUE(offset), umap, inactive); + } + +#ifdef DIAGNOSTIC + if ((flags & UBC_WRITE) && + (umap->writeoff || umap->writelen)) { + panic("ubc_fault: concurrent writes vp %p", uobj); + } +#endif + if (flags & UBC_WRITE) { + umap->writeoff = slot_offset; + umap->writelen = *lenp; + } + + umap->refcount++; + simple_unlock(&ubc_object.uobj.vmobjlock); + splx(s); + UVMHIST_LOG(ubchist, "umap %p refs %d va %p", + umap, umap->refcount, + ubc_object.kva + (umap - ubc_object.umap) * ubc_winsize,0); + + return ubc_object.kva + + (umap - ubc_object.umap) * ubc_winsize + slot_offset; +} + + +void +ubc_release(va, wlen) + void *va; + vsize_t wlen; +{ + struct ubc_map *umap; + struct uvm_object *uobj; + int s; + UVMHIST_FUNC("ubc_release"); UVMHIST_CALLED(ubchist); + + UVMHIST_LOG(ubchist, "va %p", va,0,0,0); + + s = splbio(); + simple_lock(&ubc_object.uobj.vmobjlock); + + umap = &ubc_object.umap[((char *)va - ubc_object.kva) / ubc_winsize]; + uobj = umap->uobj; + KASSERT(uobj != NULL); + + umap->writeoff = 0; + umap->writelen = 0; + umap->refcount--; + if (umap->refcount == 0) { + if (UBC_RELEASE_UNMAP && + (((struct vnode *)uobj)->v_flag & VTEXT)) { + vaddr_t va; + + /* + * if this file is the executable image of + * some process, that process will likely have + * the file mapped at an alignment other than + * what PMAP_PREFER() would like. we'd like + * to have process text be able to use the + * cache even if someone is also reading the + * file, so invalidate mappings of such files + * as soon as possible. + */ + + va = (vaddr_t)(ubc_object.kva + + (umap - ubc_object.umap) * ubc_winsize); + pmap_remove(pmap_kernel(), va, va + ubc_winsize); + LIST_REMOVE(umap, hash); + umap->uobj = NULL; + TAILQ_INSERT_HEAD(UBC_QUEUE(umap->offset), umap, + inactive); + } else { + TAILQ_INSERT_TAIL(UBC_QUEUE(umap->offset), umap, + inactive); + } + } + UVMHIST_LOG(ubchist, "umap %p refs %d", umap, umap->refcount,0,0); + simple_unlock(&ubc_object.uobj.vmobjlock); + splx(s); +} + + +/* + * removing a range of mappings from the ubc mapping cache. + */ + +void +ubc_flush(uobj, start, end) + struct uvm_object *uobj; + voff_t start, end; +{ + struct ubc_map *umap; + vaddr_t va; + int s; + UVMHIST_FUNC("ubc_flush"); UVMHIST_CALLED(ubchist); + + UVMHIST_LOG(ubchist, "uobj %p start 0x%lx end 0x%lx", + uobj, start, end,0); + + s = splbio(); + simple_lock(&ubc_object.uobj.vmobjlock); + for (umap = ubc_object.umap; + umap < &ubc_object.umap[ubc_nwins]; + umap++) { + + if (umap->uobj != uobj || + umap->offset < start || + (umap->offset >= end && end != 0) || + umap->refcount > 0) { + continue; + } + + /* + * remove from hash, + * move to head of inactive queue. 
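The release path above is also where the window-recycling policy lives: a window whose last reference goes away is normally queued at the tail of its inactive list so it stays cached, but windows backing executable images (the VTEXT case) go to the head so they are reclaimed first, and allocation of a fresh window always recycles from the head. A toy model of that policy, names invented:

#include <stdio.h>
#include <sys/queue.h>

struct win {
	const char *name;
	TAILQ_ENTRY(win) inactive;
};

TAILQ_HEAD(winq, win);
static struct winq inactiveq = TAILQ_HEAD_INITIALIZER(inactiveq);

/* drop the last reference; reclaim_soon models the VTEXT special case */
static void
win_release(struct win *w, int reclaim_soon)
{
	if (reclaim_soon)
		TAILQ_INSERT_HEAD(&inactiveq, w, inactive);
	else
		TAILQ_INSERT_TAIL(&inactiveq, w, inactive);
}

/* take the least-wanted window off the front of the inactive list */
static struct win *
win_recycle(void)
{
	struct win *w = TAILQ_FIRST(&inactiveq);

	if (w != NULL)
		TAILQ_REMOVE(&inactiveq, w, inactive);
	return w;
}

int
main(void)
{
	struct win data = { "data" }, text = { "text" };

	win_release(&data, 0);	/* ordinary file: keep cached */
	win_release(&text, 1);	/* executable image: reclaim first */
	printf("recycled %s first\n", win_recycle()->name);	/* text */
	return 0;
}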
+ */ + + va = (vaddr_t)(ubc_object.kva + + (umap - ubc_object.umap) * ubc_winsize); + pmap_remove(pmap_kernel(), va, va + ubc_winsize); + + LIST_REMOVE(umap, hash); + umap->uobj = NULL; + TAILQ_REMOVE(UBC_QUEUE(umap->offset), umap, inactive); + TAILQ_INSERT_HEAD(UBC_QUEUE(umap->offset), umap, inactive); + } + simple_unlock(&ubc_object.uobj.vmobjlock); + splx(s); +} diff --git a/sys/uvm/uvm_extern.h b/sys/uvm/uvm_extern.h index 5575021ad6f..bb6b841f0ca 100644 --- a/sys/uvm/uvm_extern.h +++ b/sys/uvm/uvm_extern.h @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_extern.h,v 1.33 2001/11/12 01:26:09 art Exp $ */ +/* $OpenBSD: uvm_extern.h,v 1.34 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: uvm_extern.h,v 1.57 2001/03/09 01:02:12 chs Exp $ */ /* @@ -223,6 +223,21 @@ typedef int vm_prot_t; #define UVM_PGA_ZERO 0x0002 /* returned page must be zero'd */ /* + * the following defines are for ubc_alloc's flags + */ +#define UBC_READ 0 +#define UBC_WRITE 1 + +/* + * flags for uvn_findpages(). + */ +#define UFP_ALL 0x0 +#define UFP_NOWAIT 0x1 +#define UFP_NOALLOC 0x2 +#define UFP_NOCACHE 0x4 +#define UFP_NORDONLY 0x8 + +/* * lockflags that control the locking behavior of various functions. */ #define UVM_LK_ENTER 0x00000001 /* map locked on entry */ @@ -464,9 +479,16 @@ void uao_detach_locked __P((struct uvm_object *)); void uao_reference __P((struct uvm_object *)); void uao_reference_locked __P((struct uvm_object *)); +/* uvm_bio.c */ +void ubc_init __P((void)); +void * ubc_alloc __P((struct uvm_object *, voff_t, vsize_t *, + int)); +void ubc_release __P((void *, vsize_t)); +void ubc_flush __P((struct uvm_object *, voff_t, voff_t)); + /* uvm_fault.c */ -int uvm_fault __P((vm_map_t, vaddr_t, - vm_fault_t, vm_prot_t)); +int uvm_fault __P((vm_map_t, vaddr_t, vm_fault_t, + vm_prot_t)); /* handle a page fault */ /* uvm_glue.c */ @@ -593,10 +615,11 @@ int uvm_deallocate __P((vm_map_t, vaddr_t, vsize_t)); /* uvm_vnode.c */ void uvm_vnp_setsize __P((struct vnode *, voff_t)); void uvm_vnp_sync __P((struct mount *)); -void uvm_vnp_terminate __P((struct vnode *)); - /* terminate a uvm/uvn object */ -boolean_t uvm_vnp_uncache __P((struct vnode *)); struct uvm_object *uvn_attach __P((void *, vm_prot_t)); +void uvn_findpages __P((struct uvm_object *, voff_t, + int *, struct vm_page **, int)); +void uvm_vnp_zerorange __P((struct vnode *, off_t, size_t)); +void uvm_vnp_asyncget __P((struct vnode *, off_t, size_t)); /* kern_malloc.c */ void kmeminit_nkmempages __P((void)); diff --git a/sys/uvm/uvm_fault.c b/sys/uvm/uvm_fault.c index 662e2509321..0e4103fe49b 100644 --- a/sys/uvm/uvm_fault.c +++ b/sys/uvm/uvm_fault.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_fault.c,v 1.24 2001/11/12 01:26:09 art Exp $ */ +/* $OpenBSD: uvm_fault.c,v 1.25 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: uvm_fault.c,v 1.56 2001/02/18 21:19:08 chs Exp $ */ /* @@ -204,11 +204,7 @@ uvmfault_anonflush(anons, n) if (pg && (pg->flags & PG_BUSY) == 0 && pg->loan_count == 0) { uvm_lock_pageq(); if (pg->wire_count == 0) { -#ifdef UBC pmap_clear_reference(pg); -#else - pmap_page_protect(pg, VM_PROT_NONE); -#endif uvm_pagedeactivate(pg); } uvm_unlock_pageq(); diff --git a/sys/uvm/uvm_map.h b/sys/uvm/uvm_map.h index bbc2afb9f19..2c95aff1607 100644 --- a/sys/uvm/uvm_map.h +++ b/sys/uvm/uvm_map.h @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_map.h,v 1.15 2001/11/12 01:26:09 art Exp $ */ +/* $OpenBSD: uvm_map.h,v 1.16 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: uvm_map.h,v 1.24 2001/02/18 21:19:08 chs Exp $ */ /* @@ -427,7 +427,7 @@ vm_map_lock(map) simple_lock(&map->flags_lock); while 
(map->flags & VM_MAP_BUSY) { map->flags |= VM_MAP_WANTLOCK; - ltsleep(&map->flags, PVM, (char *)vmmapbsy, 0, &map->flags_lock); + ltsleep(&map->flags, PVM, vmmapbsy, 0, &map->flags_lock); } error = lockmgr(&map->lock, LK_EXCLUSIVE|LK_SLEEPFAIL|LK_INTERLOCK, diff --git a/sys/uvm/uvm_mmap.c b/sys/uvm/uvm_mmap.c index 6bd7260b6a0..3c4c4bdf961 100644 --- a/sys/uvm/uvm_mmap.c +++ b/sys/uvm/uvm_mmap.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_mmap.c,v 1.27 2001/11/12 01:26:09 art Exp $ */ +/* $OpenBSD: uvm_mmap.c,v 1.28 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: uvm_mmap.c,v 1.49 2001/02/18 21:19:08 chs Exp $ */ /* @@ -1126,40 +1126,8 @@ uvm_mmap(map, addr, size, prot, maxprot, flags, handle, foff, locklimit) uobj = uvn_attach((void *) vp, (flags & MAP_SHARED) ? maxprot : (maxprot & ~VM_PROT_WRITE)); -#ifndef UBC - /* - * XXXCDC: hack from old code - * don't allow vnodes which have been mapped - * shared-writeable to persist [forces them to be - * flushed out when last reference goes]. - * XXXCDC: interesting side effect: avoids a bug. - * note that in WRITE [ufs_readwrite.c] that we - * allocate buffer, uncache, and then do the write. - * the problem with this is that if the uncache causes - * VM data to be flushed to the same area of the file - * we are writing to... in that case we've got the - * buffer locked and our process goes to sleep forever. - * - * XXXCDC: checking maxprot protects us from the - * "persistbug" program but this is not a long term - * solution. - * - * XXXCDC: we don't bother calling uncache with the vp - * VOP_LOCKed since we know that we are already - * holding a valid reference to the uvn (from the - * uvn_attach above), and thus it is impossible for - * the uncache to kill the uvn and trigger I/O. - */ - if (flags & MAP_SHARED) { - if ((prot & VM_PROT_WRITE) || - (maxprot & VM_PROT_WRITE)) { - uvm_vnp_uncache(vp); - } - } -#else /* XXX for now, attach doesn't gain a ref */ VREF(vp); -#endif } else { uobj = udv_attach((void *) &vp->v_rdev, (flags & MAP_SHARED) ? maxprot : diff --git a/sys/uvm/uvm_page.c b/sys/uvm/uvm_page.c index 4ea890c8c3b..f7ebbd77f80 100644 --- a/sys/uvm/uvm_page.c +++ b/sys/uvm/uvm_page.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_page.c,v 1.31 2001/11/12 01:26:09 art Exp $ */ +/* $OpenBSD: uvm_page.c,v 1.32 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: uvm_page.c,v 1.51 2001/03/09 01:02:12 chs Exp $ */ /* @@ -906,17 +906,11 @@ uvm_pagealloc_strat(obj, off, anon, flags, strat, free_list) * the pagedaemon. */ -#ifdef UBC if (uvmexp.free + uvmexp.paging < uvmexp.freemin || (uvmexp.free + uvmexp.paging < uvmexp.freetarg && uvmexp.inactive < uvmexp.inactarg)) { wakeup(&uvm.pagedaemon); } -#else - if (uvmexp.free < uvmexp.freemin || (uvmexp.free < uvmexp.freetarg && - uvmexp.inactive < uvmexp.inactarg)) - wakeup(&uvm.pagedaemon); -#endif /* * fail if any of these conditions is true: diff --git a/sys/uvm/uvm_page_i.h b/sys/uvm/uvm_page_i.h index e0547d8414b..3ea680714c6 100644 --- a/sys/uvm/uvm_page_i.h +++ b/sys/uvm/uvm_page_i.h @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_page_i.h,v 1.10 2001/11/12 01:26:10 art Exp $ */ +/* $OpenBSD: uvm_page_i.h,v 1.11 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: uvm_page_i.h,v 1.16 2001/01/28 23:30:45 thorpej Exp $ */ /* @@ -219,9 +219,6 @@ uvm_pagedeactivate(pg) TAILQ_INSERT_TAIL(&uvm.page_inactive_obj, pg, pageq); pg->pqflags |= PQ_INACTIVE; uvmexp.inactive++; -#ifndef UBC - pmap_clear_reference(pg); -#endif /* * update the "clean" bit. this isn't 100% * accurate, and doesn't have to be. 
we'll diff --git a/sys/uvm/uvm_pager.c b/sys/uvm/uvm_pager.c index 69400e5f010..2fded9caf08 100644 --- a/sys/uvm/uvm_pager.c +++ b/sys/uvm/uvm_pager.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_pager.c,v 1.22 2001/11/12 01:26:10 art Exp $ */ +/* $OpenBSD: uvm_pager.c,v 1.23 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: uvm_pager.c,v 1.41 2001/02/18 19:26:50 chs Exp $ */ /* @@ -58,17 +58,13 @@ struct pool *uvm_aiobuf_pool; extern struct uvm_pagerops uvm_deviceops; extern struct uvm_pagerops uvm_vnodeops; -#ifdef UBC extern struct uvm_pagerops ubc_pager; -#endif struct uvm_pagerops *uvmpagerops[] = { &aobj_pager, &uvm_deviceops, &uvm_vnodeops, -#ifdef UBC &ubc_pager, -#endif }; /* @@ -153,7 +149,7 @@ ReStart: kva = 0; /* let system choose VA */ if (uvm_map(pager_map, &kva, size, NULL, - UVM_UNKNOWN_OFFSET, 0, UVM_FLAG_NOMERGE) != KERN_SUCCESS) { + UVM_UNKNOWN_OFFSET, 0, UVM_FLAG_NOMERGE) != KERN_SUCCESS) { if (curproc == uvm.pagedaemon_proc) { simple_lock(&pager_map_wanted_lock); if (emerginuse) { @@ -733,7 +729,6 @@ uvm_pager_dropcluster(uobj, pg, ppsp, npages, flags) } } -#ifdef UBC /* * interrupt-context iodone handler for nested i/o bufs. * @@ -757,7 +752,6 @@ uvm_aio_biodone1(bp) biodone(mbp); } } -#endif /* * interrupt-context iodone handler for single-buf i/os @@ -798,12 +792,10 @@ uvm_aio_aiodone(bp) error = (bp->b_flags & B_ERROR) ? (bp->b_error ? bp->b_error : EIO) : 0; write = (bp->b_flags & B_READ) == 0; -#ifdef UBC /* XXXUBC B_NOCACHE is for swap pager, should be done differently */ if (write && !(bp->b_flags & B_NOCACHE) && bioops.io_pageiodone) { (*bioops.io_pageiodone)(bp); } -#endif uobj = NULL; for (i = 0; i < npages; i++) { diff --git a/sys/uvm/uvm_param.h b/sys/uvm/uvm_param.h index d7cdccc28a4..78b3f1bc5ba 100644 --- a/sys/uvm/uvm_param.h +++ b/sys/uvm/uvm_param.h @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_param.h,v 1.2 2001/11/12 01:26:10 art Exp $ */ +/* $OpenBSD: uvm_param.h,v 1.3 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: uvm_param.h,v 1.5 2001/03/09 01:02:12 chs Exp $ */ /* @@ -114,7 +114,7 @@ typedef int boolean_t; #define VM_ANONMIN 7 #define VM_VTEXTMIN 8 #define VM_VNODEMIN 9 -#define VM_MAXID 9 /* number of valid vm ids */ +#define VM_MAXID 10 /* number of valid vm ids */ #define CTL_VM_NAMES { \ { 0, 0 }, \ @@ -166,10 +166,8 @@ struct _ps_strings { #define trunc_page(x) ((x) & ~PAGE_MASK) extern psize_t mem_size; /* size of physical memory (bytes) */ -#ifdef UBC extern int ubc_nwins; /* number of UBC mapping windows */ extern int ubc_winsize; /* size of a UBC mapping window */ -#endif #else /* out-of-kernel versions of round_page and trunc_page */ diff --git a/sys/uvm/uvm_swap.c b/sys/uvm/uvm_swap.c index 4697d8a23f6..c4298200688 100644 --- a/sys/uvm/uvm_swap.c +++ b/sys/uvm/uvm_swap.c @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_swap.c,v 1.41 2001/11/15 23:15:15 art Exp $ */ +/* $OpenBSD: uvm_swap.c,v 1.42 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: uvm_swap.c,v 1.46 2001/02/18 21:19:08 chs Exp $ */ /* @@ -1393,32 +1393,6 @@ sw_reg_strategy(sdp, bp, bn) nbp->vb_buf.b_vnbufs.le_next = NOLIST; LIST_INIT(&nbp->vb_buf.b_dep); - /* - * set b_dirtyoff/end and b_validoff/end. this is - * required by the NFS client code (otherwise it will - * just discard our I/O request). 
- */ - if (bp->b_dirtyend == 0) { - nbp->vb_buf.b_dirtyoff = 0; - nbp->vb_buf.b_dirtyend = sz; - } else { - nbp->vb_buf.b_dirtyoff = - max(0, bp->b_dirtyoff - (bp->b_bcount-resid)); - nbp->vb_buf.b_dirtyend = - min(sz, - max(0, bp->b_dirtyend - (bp->b_bcount-resid))); - } - if (bp->b_validend == 0) { - nbp->vb_buf.b_validoff = 0; - nbp->vb_buf.b_validend = sz; - } else { - nbp->vb_buf.b_validoff = - max(0, bp->b_validoff - (bp->b_bcount-resid)); - nbp->vb_buf.b_validend = - min(sz, - max(0, bp->b_validend - (bp->b_bcount-resid))); - } - nbp->vb_xfer = vnx; /* patch it back in to vnx */ /* @@ -1990,8 +1964,6 @@ uvm_swap_io(pps, startslot, npages, flags) * and we bump v_numoutput (counter of number of active outputs). */ if (write) { - bp->b_dirtyoff = 0; - bp->b_dirtyend = npages << PAGE_SHIFT; #ifdef UVM_SWAP_ENCRYPT /* mark the pages in the drum for decryption */ if (swap_encrypt_initalized) diff --git a/sys/uvm/uvm_vnode.c b/sys/uvm/uvm_vnode.c index e921e4fb846..667cbc5b458 100644 --- a/sys/uvm/uvm_vnode.c +++ b/sys/uvm/uvm_vnode.c @@ -1,5 +1,5 @@ -/* $OpenBSD: uvm_vnode.c,v 1.24 2001/11/10 18:42:32 art Exp $ */ -/* $NetBSD: uvm_vnode.c,v 1.36 2000/11/24 20:34:01 chs Exp $ */ +/* $OpenBSD: uvm_vnode.c,v 1.25 2001/11/27 05:27:12 art Exp $ */ +/* $NetBSD: uvm_vnode.c,v 1.47 2001/03/09 01:02:13 chs Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. @@ -52,6 +52,7 @@ #include <sys/param.h> #include <sys/systm.h> +#include <sys/kernel.h> #include <sys/proc.h> #include <sys/malloc.h> #include <sys/vnode.h> @@ -59,6 +60,8 @@ #include <sys/ioctl.h> #include <sys/fcntl.h> #include <sys/conf.h> +#include <sys/pool.h> +#include <sys/mount.h> #include <miscfs/specfs/specdev.h> @@ -66,55 +69,38 @@ #include <uvm/uvm_vnode.h> /* - * private global data structure - * - * we keep a list of writeable active vnode-backed VM objects for sync op. - * we keep a simpleq of vnodes that are currently being sync'd. 
- */ - -LIST_HEAD(uvn_list_struct, uvm_vnode); -static struct uvn_list_struct uvn_wlist; /* writeable uvns */ -static simple_lock_data_t uvn_wl_lock; /* locks uvn_wlist */ - -SIMPLEQ_HEAD(uvn_sq_struct, uvm_vnode); -static struct uvn_sq_struct uvn_sync_q; /* sync'ing uvns */ -lock_data_t uvn_sync_lock; /* locks sync operation */ - -/* * functions */ -static void uvn_cluster __P((struct uvm_object *, voff_t, - voff_t *, voff_t *)); -static void uvn_detach __P((struct uvm_object *)); -static boolean_t uvn_flush __P((struct uvm_object *, voff_t, - voff_t, int)); -static int uvn_get __P((struct uvm_object *, voff_t, - vm_page_t *, int *, int, - vm_prot_t, int, int)); -static void uvn_init __P((void)); -static int uvn_io __P((struct uvm_vnode *, vm_page_t *, - int, int, int)); -static int uvn_put __P((struct uvm_object *, vm_page_t *, - int, boolean_t)); -static void uvn_reference __P((struct uvm_object *)); -static boolean_t uvn_releasepg __P((struct vm_page *, - struct vm_page **)); +static void uvn_cluster __P((struct uvm_object *, voff_t, voff_t *, + voff_t *)); +static void uvn_detach __P((struct uvm_object *)); +static int uvn_findpage __P((struct uvm_object *, voff_t, + struct vm_page **, int)); +boolean_t uvn_flush __P((struct uvm_object *, voff_t, voff_t, + int)); +static int uvn_get __P((struct uvm_object *, voff_t, vm_page_t *, + int *, int, vm_prot_t, int, int)); +static int uvn_put __P((struct uvm_object *, vm_page_t *, int, + boolean_t)); +static void uvn_reference __P((struct uvm_object *)); +static boolean_t uvn_releasepg __P((struct vm_page *, + struct vm_page **)); /* * master pager structure */ struct uvm_pagerops uvm_vnodeops = { - uvn_init, + NULL, uvn_reference, uvn_detach, - NULL, /* no specialized fault routine required */ + NULL, uvn_flush, uvn_get, uvn_put, uvn_cluster, - uvm_mk_pcluster, /* use generic version of this: see uvm_pager.c */ + uvm_mk_pcluster, uvn_releasepg, }; @@ -123,22 +109,6 @@ struct uvm_pagerops uvm_vnodeops = { */ /* - * uvn_init - * - * init pager private data structures. - */ - -static void -uvn_init() -{ - - LIST_INIT(&uvn_wlist); - simple_lock_init(&uvn_wl_lock); - /* note: uvn_sync_q init'd in uvm_vnp_sync() */ - lockinit(&uvn_sync_lock, PVM, "uvnsync", 0, 0); -} - -/* * uvn_attach * * attach a vnode structure to a VM object. if the vnode is already @@ -161,23 +131,20 @@ uvn_attach(arg, accessprot) struct vnode *vp = arg; struct uvm_vnode *uvn = &vp->v_uvm; struct vattr vattr; - int oldflags, result; + int result; struct partinfo pi; - u_quad_t used_vnode_size; + voff_t used_vnode_size; UVMHIST_FUNC("uvn_attach"); UVMHIST_CALLED(maphist); UVMHIST_LOG(maphist, "(vn=0x%x)", arg,0,0,0); - - used_vnode_size = (u_quad_t)0; /* XXX gcc -Wuninitialized */ + used_vnode_size = (voff_t)0; /* * first get a lock on the uvn. */ simple_lock(&uvn->u_obj.vmobjlock); - while (uvn->u_flags & UVM_VNODE_BLOCKED) { - printf("uvn_attach: blocked at 0x%p flags 0x%x\n", - uvn, uvn->u_flags); - uvn->u_flags |= UVM_VNODE_WANTED; + while (uvn->u_flags & VXLOCK) { + uvn->u_flags |= VXWANT; UVMHIST_LOG(maphist, " SLEEPING on blocked vn",0,0,0,0); UVM_UNLOCK_AND_WAIT(uvn, &uvn->u_obj.vmobjlock, FALSE, "uvn_attach", 0); @@ -189,56 +156,26 @@ uvn_attach(arg, accessprot) * if we're mapping a BLK device, make sure it is a disk. 
*/ if (vp->v_type == VBLK && bdevsw[major(vp->v_rdev)].d_type != D_DISK) { - simple_unlock(&uvn->u_obj.vmobjlock); /* drop lock */ + simple_unlock(&uvn->u_obj.vmobjlock); UVMHIST_LOG(maphist,"<- done (VBLK not D_DISK!)", 0,0,0,0); return(NULL); } - /* - * now we have lock and uvn must not be in a blocked state. - * first check to see if it is already active, in which case - * we can bump the reference count, check to see if we need to - * add it to the writeable list, and then return. - */ - if (uvn->u_flags & UVM_VNODE_VALID) { /* already active? */ - - /* regain VREF if we were persisting */ - if (uvn->u_obj.uo_refs == 0) { - VREF(vp); - UVMHIST_LOG(maphist," VREF (reclaim persisting vnode)", - 0,0,0,0); - } - uvn->u_obj.uo_refs++; /* bump uvn ref! */ - - /* check for new writeable uvn */ - if ((accessprot & VM_PROT_WRITE) != 0 && - (uvn->u_flags & UVM_VNODE_WRITEABLE) == 0) { - simple_lock(&uvn_wl_lock); - LIST_INSERT_HEAD(&uvn_wlist, uvn, u_wlist); - simple_unlock(&uvn_wl_lock); - /* we are now on wlist! */ - uvn->u_flags |= UVM_VNODE_WRITEABLE; - } - - /* unlock and return */ - simple_unlock(&uvn->u_obj.vmobjlock); - UVMHIST_LOG(maphist,"<- done, refcnt=%d", uvn->u_obj.uo_refs, - 0, 0, 0); - return (&uvn->u_obj); - } +#ifdef DIAGNOSTIC + if (vp->v_type != VREG) { + panic("uvn_attach: vp %p not VREG", vp); + } +#endif /* - * need to call VOP_GETATTR() to get the attributes, but that could - * block (due to I/O), so we want to unlock the object before calling. - * however, we want to keep anyone else from playing with the object - * while it is unlocked. to do this we set UVM_VNODE_ALOCK which - * prevents anyone from attaching to the vnode until we are done with - * it. + * set up our idea of the size + * if this hasn't been done already. */ - uvn->u_flags = UVM_VNODE_ALOCK; + if (uvn->u_size == VSIZENOTSET) { + + uvn->u_flags |= VXLOCK; simple_unlock(&uvn->u_obj.vmobjlock); /* drop lock in case we sleep */ /* XXX: curproc? */ - if (vp->v_type == VBLK) { /* * We could implement this as a specfs getattr call, but: @@ -252,8 +189,8 @@ uvn_attach(arg, accessprot) DIOCGPART, (caddr_t)&pi, FREAD, curproc); if (result == 0) { /* XXX should remember blocksize */ - used_vnode_size = (u_quad_t)pi.disklab->d_secsize * - (u_quad_t)pi.part->p_size; + used_vnode_size = (voff_t)pi.disklab->d_secsize * + (voff_t)pi.part->p_size; } } else { result = VOP_GETATTR(vp, &vattr, curproc->p_ucred, curproc); @@ -262,58 +199,26 @@ uvn_attach(arg, accessprot) } /* relock object */ - simple_lock(&uvn->u_obj.vmobjlock); + simple_lock(&uvn->u_obj.vmobjlock); + + if (uvn->u_flags & VXWANT) + wakeup(uvn); + uvn->u_flags &= ~(VXLOCK|VXWANT); if (result != 0) { - if (uvn->u_flags & UVM_VNODE_WANTED) - wakeup(uvn); - uvn->u_flags = 0; simple_unlock(&uvn->u_obj.vmobjlock); /* drop lock */ UVMHIST_LOG(maphist,"<- done (VOP_GETATTR FAILED!)", 0,0,0,0); return(NULL); } - - /* - * make sure that the newsize fits within a vaddr_t - * XXX: need to revise addressing data types - */ -#ifdef DEBUG - if (vp->v_type == VBLK) - printf("used_vnode_size = %llu\n", (long long)used_vnode_size); -#endif - - /* - * now set up the uvn. - */ - uvn->u_obj.pgops = &uvm_vnodeops; - TAILQ_INIT(&uvn->u_obj.memq); - uvn->u_obj.uo_npages = 0; - uvn->u_obj.uo_refs = 1; /* just us... 
*/ - oldflags = uvn->u_flags; - uvn->u_flags = UVM_VNODE_VALID|UVM_VNODE_CANPERSIST; - uvn->u_nio = 0; uvn->u_size = used_vnode_size; - /* if write access, we need to add it to the wlist */ - if (accessprot & VM_PROT_WRITE) { - simple_lock(&uvn_wl_lock); - LIST_INSERT_HEAD(&uvn_wlist, uvn, u_wlist); - simple_unlock(&uvn_wl_lock); - uvn->u_flags |= UVM_VNODE_WRITEABLE; /* we are on wlist! */ } - /* - * add a reference to the vnode. this reference will stay as long - * as there is a valid mapping of the vnode. dropped when the - * reference count goes to zero [and we either free or persist]. - */ - VREF(vp); + /* unlock and return */ simple_unlock(&uvn->u_obj.vmobjlock); - if (oldflags & UVM_VNODE_WANTED) - wakeup(uvn); - - UVMHIST_LOG(maphist,"<- done/VREF, ret 0x%x", &uvn->u_obj,0,0,0); - return(&uvn->u_obj); + UVMHIST_LOG(maphist,"<- done, refcnt=%d", uvn->u_obj.uo_refs, + 0, 0, 0); + return (&uvn->u_obj); } @@ -333,23 +238,7 @@ static void uvn_reference(uobj) struct uvm_object *uobj; { -#ifdef DEBUG - struct uvm_vnode *uvn = (struct uvm_vnode *) uobj; -#endif - UVMHIST_FUNC("uvn_reference"); UVMHIST_CALLED(maphist); - - simple_lock(&uobj->vmobjlock); -#ifdef DEBUG - if ((uvn->u_flags & UVM_VNODE_VALID) == 0) { - printf("uvn_reference: ref=%d, flags=0x%x\n", uvn->u_flags, - uobj->uo_refs); - panic("uvn_reference: invalid state"); - } -#endif - uobj->uo_refs++; - UVMHIST_LOG(maphist, "<- done (uobj=0x%x, ref = %d)", - uobj, uobj->uo_refs,0,0); - simple_unlock(&uobj->vmobjlock); + VREF((struct vnode *)uobj); } /* @@ -365,291 +254,7 @@ static void uvn_detach(uobj) struct uvm_object *uobj; { - struct uvm_vnode *uvn; - struct vnode *vp; - int oldflags; - UVMHIST_FUNC("uvn_detach"); UVMHIST_CALLED(maphist); - - simple_lock(&uobj->vmobjlock); - - UVMHIST_LOG(maphist," (uobj=0x%x) ref=%d", uobj,uobj->uo_refs,0,0); - uobj->uo_refs--; /* drop ref! */ - if (uobj->uo_refs) { /* still more refs */ - simple_unlock(&uobj->vmobjlock); - UVMHIST_LOG(maphist, "<- done (rc>0)", 0,0,0,0); - return; - } - - /* - * get other pointers ... - */ - - uvn = (struct uvm_vnode *) uobj; - vp = (struct vnode *) uobj; - - /* - * clear VTEXT flag now that there are no mappings left (VTEXT is used - * to keep an active text file from being overwritten). - */ - vp->v_flag &= ~VTEXT; - - /* - * we just dropped the last reference to the uvn. see if we can - * let it "stick around". - */ - - if (uvn->u_flags & UVM_VNODE_CANPERSIST) { - /* won't block */ - uvn_flush(uobj, 0, 0, PGO_DEACTIVATE|PGO_ALLPAGES); - simple_unlock(&uobj->vmobjlock); - vrele(vp); /* drop vnode reference */ - UVMHIST_LOG(maphist,"<- done/vrele! (persist)", 0,0,0,0); - return; - } - - /* - * its a goner! - */ - - UVMHIST_LOG(maphist," its a goner (flushing)!", 0,0,0,0); - - uvn->u_flags |= UVM_VNODE_DYING; - - /* - * even though we may unlock in flush, no one can gain a reference - * to us until we clear the "dying" flag [because it blocks - * attaches]. we will not do that until after we've disposed of all - * the pages with uvn_flush(). note that before the flush the only - * pages that could be marked PG_BUSY are ones that are in async - * pageout by the daemon. (there can't be any pending "get"'s - * because there are no references to the object). 
- */ - - (void) uvn_flush(uobj, 0, 0, PGO_CLEANIT|PGO_FREE|PGO_ALLPAGES); - - UVMHIST_LOG(maphist," its a goner (done flush)!", 0,0,0,0); - - /* - * given the structure of this pager, the above flush request will - * create the following state: all the pages that were in the object - * have either been free'd or they are marked PG_BUSY|PG_RELEASED. - * the PG_BUSY bit was set either by us or the daemon for async I/O. - * in either case, if we have pages left we can't kill the object - * yet because i/o is pending. in this case we set the "relkill" - * flag which will cause pgo_releasepg to kill the object once all - * the I/O's are done [pgo_releasepg will be called from the aiodone - * routine or from the page daemon]. - */ - - if (uobj->uo_npages) { /* I/O pending. iodone will free */ -#ifdef DEBUG - /* - * XXXCDC: very unlikely to happen until we have async i/o - * so print a little info message in case it does. - */ - printf("uvn_detach: vn %p has pages left after flush - " - "relkill mode\n", uobj); -#endif - uvn->u_flags |= UVM_VNODE_RELKILL; - simple_unlock(&uobj->vmobjlock); - UVMHIST_LOG(maphist,"<- done! (releasepg will kill obj)", 0, 0, - 0, 0); - return; - } - - /* - * kill object now. note that we can't be on the sync q because - * all references are gone. - */ - if (uvn->u_flags & UVM_VNODE_WRITEABLE) { - simple_lock(&uvn_wl_lock); /* protect uvn_wlist */ - LIST_REMOVE(uvn, u_wlist); - simple_unlock(&uvn_wl_lock); - } -#ifdef DIAGNOSTIC - if (uobj->memq.tqh_first != NULL) - panic("uvn_deref: vnode VM object still has pages afer " - "syncio/free flush"); -#endif - oldflags = uvn->u_flags; - uvn->u_flags = 0; - simple_unlock(&uobj->vmobjlock); - - /* wake up any sleepers */ - if (oldflags & UVM_VNODE_WANTED) - wakeup(uvn); - - /* - * drop our reference to the vnode. - */ - vrele(vp); - UVMHIST_LOG(maphist,"<- done (vrele) final", 0,0,0,0); - - return; -} - -/* - * uvm_vnp_terminate: external hook to clear out a vnode's VM - * - * called in two cases: - * [1] when a persisting vnode vm object (i.e. one with a zero reference - * count) needs to be freed so that a vnode can be reused. this - * happens under "getnewvnode" in vfs_subr.c. if the vnode from - * the free list is still attached (i.e. not VBAD) then vgone is - * called. as part of the vgone trace this should get called to - * free the vm object. this is the common case. - * [2] when a filesystem is being unmounted by force (MNT_FORCE, - * "umount -f") the vgone() function is called on active vnodes - * on the mounted file systems to kill their data (the vnodes become - * "dead" ones [see src/sys/miscfs/deadfs/...]). that results in a - * call here (even if the uvn is still in use -- i.e. has a non-zero - * reference count). this case happens at "umount -f" and during a - * "reboot/halt" operation. - * - * => the caller must XLOCK and VOP_LOCK the vnode before calling us - * [protects us from getting a vnode that is already in the DYING - * state...] - * => unlike uvn_detach, this function must not return until all the - * uvn's pages are disposed of. - * => in case [2] the uvn is still alive after this call, but all I/O - * ops will fail (due to the backing vnode now being "dead"). this - * will prob. kill any process using the uvn due to pgo_get failing. 
- */ - -void -uvm_vnp_terminate(vp) - struct vnode *vp; -{ - struct uvm_vnode *uvn = &vp->v_uvm; - int oldflags; - UVMHIST_FUNC("uvm_vnp_terminate"); UVMHIST_CALLED(maphist); - - /* - * lock object and check if it is valid - */ - simple_lock(&uvn->u_obj.vmobjlock); - UVMHIST_LOG(maphist, " vp=0x%x, ref=%d, flag=0x%x", vp, - uvn->u_obj.uo_refs, uvn->u_flags, 0); - if ((uvn->u_flags & UVM_VNODE_VALID) == 0) { - simple_unlock(&uvn->u_obj.vmobjlock); - UVMHIST_LOG(maphist, "<- done (not active)", 0, 0, 0, 0); - return; - } - - /* - * must be a valid uvn that is not already dying (because XLOCK - * protects us from that). the uvn can't in the ALOCK state - * because it is valid, and uvn's that are in the ALOCK state haven't - * been marked valid yet. - */ - -#ifdef DEBUG - /* - * debug check: are we yanking the vnode out from under our uvn? - */ - if (uvn->u_obj.uo_refs) { - printf("uvm_vnp_terminate(%p): terminating active vnode " - "(refs=%d)\n", uvn, uvn->u_obj.uo_refs); - } -#endif - - /* - * it is possible that the uvn was detached and is in the relkill - * state [i.e. waiting for async i/o to finish so that releasepg can - * kill object]. we take over the vnode now and cancel the relkill. - * we want to know when the i/o is done so we can recycle right - * away. note that a uvn can only be in the RELKILL state if it - * has a zero reference count. - */ - - if (uvn->u_flags & UVM_VNODE_RELKILL) - uvn->u_flags &= ~UVM_VNODE_RELKILL; /* cancel RELKILL */ - - /* - * block the uvn by setting the dying flag, and then flush the - * pages. (note that flush may unlock object while doing I/O, but - * it will re-lock it before it returns control here). - * - * also, note that we tell I/O that we are already VOP_LOCK'd so - * that uvn_io doesn't attempt to VOP_LOCK again. - * - * XXXCDC: setting VNISLOCKED on an active uvn which is being terminated - * due to a forceful unmount might not be a good idea. maybe we - * need a way to pass in this info to uvn_flush through a - * pager-defined PGO_ constant [currently there are none]. - */ - uvn->u_flags |= UVM_VNODE_DYING|UVM_VNODE_VNISLOCKED; - - (void) uvn_flush(&uvn->u_obj, 0, 0, PGO_CLEANIT|PGO_FREE|PGO_ALLPAGES); - - /* - * as we just did a flush we expect all the pages to be gone or in - * the process of going. sleep to wait for the rest to go [via iosync]. - */ - - while (uvn->u_obj.uo_npages) { -#ifdef DEBUG - struct vm_page *pp; - for (pp = uvn->u_obj.memq.tqh_first ; pp != NULL ; - pp = pp->listq.tqe_next) { - if ((pp->flags & PG_BUSY) == 0) - panic("uvm_vnp_terminate: detected unbusy pg"); - } - if (uvn->u_nio == 0) - panic("uvm_vnp_terminate: no I/O to wait for?"); - printf("uvm_vnp_terminate: waiting for I/O to fin.\n"); - /* - * XXXCDC: this is unlikely to happen without async i/o so we - * put a printf in just to keep an eye on it. - */ -#endif - uvn->u_flags |= UVM_VNODE_IOSYNC; - UVM_UNLOCK_AND_WAIT(&uvn->u_nio, &uvn->u_obj.vmobjlock, FALSE, - "uvn_term",0); - simple_lock(&uvn->u_obj.vmobjlock); - } - - /* - * done. now we free the uvn if its reference count is zero - * (true if we are zapping a persisting uvn). however, if we are - * terminating a uvn with active mappings we let it live ... future - * calls down to the vnode layer will fail. - */ - - oldflags = uvn->u_flags; - if (uvn->u_obj.uo_refs) { - - /* - * uvn must live on it is dead-vnode state until all references - * are gone. restore flags. clear CANPERSIST state. 
- */ - - uvn->u_flags &= ~(UVM_VNODE_DYING|UVM_VNODE_VNISLOCKED| - UVM_VNODE_WANTED|UVM_VNODE_CANPERSIST); - - } else { - - /* - * free the uvn now. note that the VREF reference is already - * gone [it is dropped when we enter the persist state]. - */ - if (uvn->u_flags & UVM_VNODE_IOSYNCWANTED) - panic("uvm_vnp_terminate: io sync wanted bit set"); - - if (uvn->u_flags & UVM_VNODE_WRITEABLE) { - simple_lock(&uvn_wl_lock); - LIST_REMOVE(uvn, u_wlist); - simple_unlock(&uvn_wl_lock); - } - uvn->u_flags = 0; /* uvn is history, clear all bits */ - } - - if (oldflags & UVM_VNODE_WANTED) - wakeup(uvn); /* object lock still held */ - - simple_unlock(&uvn->u_obj.vmobjlock); - UVMHIST_LOG(maphist, "<- done", 0, 0, 0, 0); - + vrele((struct vnode *)uobj); } /* @@ -662,7 +267,7 @@ uvm_vnp_terminate(vp) * => returns TRUE if page's object is still alive, FALSE if we * killed the page's object. if we return TRUE, then we * return with the object locked. - * => if (nextpgp != NULL) => we return pageq.tqe_next here, and return + * => if (nextpgp != NULL) => we return the next page on the queue, and return * with the page queues locked [for pagedaemon] * => if (nextpgp == NULL) => we return with page queues unlocked [normal case] * => we kill the uvn if it is not referenced and we are suppose to @@ -674,11 +279,7 @@ uvn_releasepg(pg, nextpgp) struct vm_page *pg; struct vm_page **nextpgp; /* OUT */ { - struct uvm_vnode *uvn = (struct uvm_vnode *) pg->uobject; -#ifdef DIAGNOSTIC - if ((pg->flags & PG_RELEASED) == 0) - panic("uvn_releasepg: page not released!"); -#endif + KASSERT(pg->flags & PG_RELEASED); /* * dispose of the page [caller handles PG_WANTED] @@ -686,64 +287,25 @@ uvn_releasepg(pg, nextpgp) pmap_page_protect(pg, VM_PROT_NONE); uvm_lock_pageq(); if (nextpgp) - *nextpgp = pg->pageq.tqe_next; /* next page for daemon */ + *nextpgp = TAILQ_NEXT(pg, pageq); uvm_pagefree(pg); if (!nextpgp) uvm_unlock_pageq(); - /* - * now see if we need to kill the object - */ - if (uvn->u_flags & UVM_VNODE_RELKILL) { - if (uvn->u_obj.uo_refs) - panic("uvn_releasepg: kill flag set on referenced " - "object!"); - if (uvn->u_obj.uo_npages == 0) { - if (uvn->u_flags & UVM_VNODE_WRITEABLE) { - simple_lock(&uvn_wl_lock); - LIST_REMOVE(uvn, u_wlist); - simple_unlock(&uvn_wl_lock); - } -#ifdef DIAGNOSTIC - if (uvn->u_obj.memq.tqh_first) - panic("uvn_releasepg: pages in object with npages == 0"); -#endif - if (uvn->u_flags & UVM_VNODE_WANTED) - /* still holding object lock */ - wakeup(uvn); - - uvn->u_flags = 0; /* DEAD! */ - simple_unlock(&uvn->u_obj.vmobjlock); - return (FALSE); - } - } return (TRUE); } /* - * NOTE: currently we have to use VOP_READ/VOP_WRITE because they go - * through the buffer cache and allow I/O in any size. These VOPs use - * synchronous i/o. [vs. VOP_STRATEGY which can be async, but doesn't - * go through the buffer cache or allow I/O sizes larger than a - * block]. we will eventually want to change this. - * * issues to consider: - * uvm provides the uvm_aiodesc structure for async i/o management. * there are two tailq's in the uvm. structure... one for pending async * i/o and one for "done" async i/o. to do an async i/o one puts - * an aiodesc on the "pending" list (protected by splbio()), starts the + * a buf on the "pending" list (protected by splbio()), starts the * i/o and returns VM_PAGER_PEND. when the i/o is done, we expect * some sort of "i/o done" function to be called (at splbio(), interrupt - * time). this function should remove the aiodesc from the pending list + * time). 
this function should remove the buf from the pending list * and place it on the "done" list and wakeup the daemon. the daemon * will run at normal spl() and will remove all items from the "done" - * list and call the "aiodone" hook for each done request (see uvm_pager.c). - * [in the old vm code, this was done by calling the "put" routine with - * null arguments which made the code harder to read and understand because - * you had one function ("put") doing two things.] - * - * so the current pager needs: - * int uvn_aiodone(struct uvm_aiodesc *) + * list and call the iodone hook for each done request (see uvm_pager.c). * * => return KERN_SUCCESS (aio finished, free it). otherwise requeue for * later collection. @@ -764,15 +326,17 @@ uvn_releasepg(pg, nextpgp) /* * uvn_flush: flush pages out of a uvm object. * + * => "stop == 0" means flush all pages at or after "start". * => object should be locked by caller. we may _unlock_ the object - * if (and only if) we need to clean a page (PGO_CLEANIT). + * if (and only if) we need to clean a page (PGO_CLEANIT), or + * if PGO_SYNCIO is set and there are pages busy. * we return with the object locked. - * => if PGO_CLEANIT is set, we may block (due to I/O). thus, a caller - * might want to unlock higher level resources (e.g. vm_map) - * before calling flush. - * => if PGO_CLEANIT is not set, then we will neither unlock the object - * or block. - * => if PGO_ALLPAGE is set, then all pages in the object are valid targets + * => if PGO_CLEANIT or PGO_SYNCIO is set, we may block (due to I/O). + * thus, a caller might want to unlock higher level resources + * (e.g. vm_map) before calling flush. + * => if neither PGO_CLEANIT nor PGO_SYNCIO is set, then we will neither + * unlock the object nor block. + * => if PGO_ALLPAGES is set, then all pages in the object are valid targets * for flushing. * => NOTE: we rely on the fact that the object's memq is a TAILQ and * that new pages are inserted on the tail end of the list. 
thus, @@ -814,39 +378,62 @@ uvn_releasepg(pg, nextpgp) #define UVN_HASH_PENALTY 4 /* XXX: a guess */ -static boolean_t +boolean_t uvn_flush(uobj, start, stop, flags) struct uvm_object *uobj; voff_t start, stop; int flags; { - struct uvm_vnode *uvn = (struct uvm_vnode *) uobj; + struct uvm_vnode *uvn = (struct uvm_vnode *)uobj; + struct vnode *vp = (struct vnode *)uobj; struct vm_page *pp, *ppnext, *ptmp; - struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp; + struct vm_page *pps[256], **ppsp; + int s; int npages, result, lcv; - boolean_t retval, need_iosync, by_list, needs_clean, all; + boolean_t retval, need_iosync, by_list, needs_clean, all, wasclean; voff_t curoff; u_short pp_version; UVMHIST_FUNC("uvn_flush"); UVMHIST_CALLED(maphist); + UVMHIST_LOG(maphist, "uobj %p start 0x%x stop 0x%x flags 0x%x", + uobj, start, stop, flags); + KASSERT(flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)); + + if (uobj->uo_npages == 0) { + s = splbio(); + if (LIST_FIRST(&vp->v_dirtyblkhd) == NULL && + (vp->v_bioflag & VBIOONSYNCLIST)) { + vp->v_bioflag &= ~VBIOONSYNCLIST; + LIST_REMOVE(vp, v_synclist); + } + splx(s); + return TRUE; + } + +#ifdef DIAGNOSTIC + if (uvn->u_size == VSIZENOTSET) { + printf("uvn_flush: size not set vp %p\n", uvn); + vprint("uvn_flush VSIZENOTSET", vp); + flags |= PGO_ALLPAGES; + } +#endif - curoff = 0; /* XXX: shut up gcc */ /* * get init vals and determine how we are going to traverse object */ + if (stop == 0) { + stop = trunc_page(LLONG_MAX); + } + curoff = 0; need_iosync = FALSE; - retval = TRUE; /* return value */ + retval = TRUE; + wasclean = TRUE; if (flags & PGO_ALLPAGES) { all = TRUE; - by_list = TRUE; /* always go by the list */ + by_list = TRUE; } else { start = trunc_page(start); stop = round_page(stop); -#ifdef DEBUG - if (stop > round_page(uvn->u_size)) - printf("uvn_flush: strange, got an out of range " - "flush (fixed)\n"); -#endif all = FALSE; by_list = (uobj->uo_npages <= ((stop - start) >> PAGE_SHIFT) * UVN_HASH_PENALTY); @@ -870,8 +457,7 @@ uvn_flush(uobj, start, stop, flags) if ((flags & PGO_CLEANIT) != 0 && uobj->pgops->pgo_mk_pcluster != NULL) { if (by_list) { - for (pp = uobj->memq.tqh_first ; pp != NULL ; - pp = pp->listq.tqe_next) { + TAILQ_FOREACH(pp, &uobj->memq, listq) { if (!all && (pp->offset < start || pp->offset >= stop)) continue; @@ -895,45 +481,32 @@ uvn_flush(uobj, start, stop, flags) */ if (by_list) { - pp = uobj->memq.tqh_first; + pp = TAILQ_FIRST(&uobj->memq); } else { curoff = start; pp = uvm_pagelookup(uobj, curoff); } - ppnext = NULL; /* XXX: shut up gcc */ - ppsp = NULL; /* XXX: shut up gcc */ - uvm_lock_pageq(); /* page queues locked */ + ppnext = NULL; + ppsp = NULL; + uvm_lock_pageq(); /* locked: both page queues and uobj */ for ( ; (by_list && pp != NULL) || - (!by_list && curoff < stop) ; pp = ppnext) { - + (!by_list && curoff < stop) ; pp = ppnext) { if (by_list) { - - /* - * range check - */ - if (!all && (pp->offset < start || pp->offset >= stop)) { - ppnext = pp->listq.tqe_next; + ppnext = TAILQ_NEXT(pp, listq); continue; } - } else { - - /* - * null check - */ - curoff += PAGE_SIZE; if (pp == NULL) { if (curoff < stop) ppnext = uvm_pagelookup(uobj, curoff); continue; } - } /* @@ -949,24 +522,23 @@ uvn_flush(uobj, start, stop, flags) if ((flags & PGO_CLEANIT) == 0 || (pp->flags & PG_BUSY) != 0) { needs_clean = FALSE; - if ((pp->flags & PG_BUSY) != 0 && - (flags & (PGO_CLEANIT|PGO_SYNCIO)) == - (PGO_CLEANIT|PGO_SYNCIO)) + if (flags & PGO_SYNCIO) need_iosync = TRUE; } else { + /* * freeing: nuke all mappings so we can sync * 
PG_CLEAN bit with no race */ if ((pp->flags & PG_CLEAN) != 0 && (flags & PGO_FREE) != 0 && - (pp->pqflags & PQ_ACTIVE) != 0) + /* XXX ACTIVE|INACTIVE test unnecessary? */ + (pp->pqflags & (PQ_ACTIVE|PQ_INACTIVE)) != 0) pmap_page_protect(pp, VM_PROT_NONE); if ((pp->flags & PG_CLEAN) != 0 && pmap_is_modified(pp)) pp->flags &= ~(PG_CLEAN); - pp->flags |= PG_CLEANCHK; /* update "hint" */ - + pp->flags |= PG_CLEANCHK; needs_clean = ((pp->flags & PG_CLEAN) == 0); } @@ -974,29 +546,26 @@ uvn_flush(uobj, start, stop, flags) * if we don't need a clean... load ppnext and dispose of pp */ if (!needs_clean) { - /* load ppnext */ if (by_list) - ppnext = pp->listq.tqe_next; + ppnext = TAILQ_NEXT(pp, listq); else { if (curoff < stop) ppnext = uvm_pagelookup(uobj, curoff); } - /* now dispose of pp */ if (flags & PGO_DEACTIVATE) { if ((pp->pqflags & PQ_INACTIVE) == 0 && + (pp->flags & PG_BUSY) == 0 && pp->wire_count == 0) { - pmap_page_protect(pp, VM_PROT_NONE); + pmap_clear_reference(pp); uvm_pagedeactivate(pp); } } else if (flags & PGO_FREE) { if (pp->flags & PG_BUSY) { - /* release busy pages */ pp->flags |= PG_RELEASED; } else { pmap_page_protect(pp, VM_PROT_NONE); - /* removed page from object */ uvm_pagefree(pp); } } @@ -1013,6 +582,7 @@ uvn_flush(uobj, start, stop, flags) * note: locked: uobj and page queues. */ + wasclean = FALSE; pp->flags |= PG_BUSY; /* we 'own' page now */ UVM_PAGE_OWN(pp, "uvn_flush"); pmap_page_protect(pp, VM_PROT_READ); @@ -1023,7 +593,7 @@ ReTry: /* locked: page queues, uobj */ result = uvm_pager_put(uobj, pp, &ppsp, &npages, - flags | PGO_DOACTCLUST, start, stop); + flags | PGO_DOACTCLUST, start, stop); /* unlocked: page queues, uobj */ /* @@ -1046,7 +616,8 @@ ReTry: */ if (result == VM_PAGER_AGAIN) { - /* + + /* * it is unlikely, but page could have been released * while we had the object lock dropped. we ignore * this now and retry the I/O. we will detect and @@ -1073,27 +644,22 @@ ReTry: * we can move on to the next page. */ - if (result == VM_PAGER_PEND) { + if (result == VM_PAGER_PEND && + (flags & (PGO_DEACTIVATE|PGO_FREE)) == 0) { - if ((flags & (PGO_DEACTIVATE|PGO_FREE)) == 0) { - /* - * no per-page ops: refresh ppnext and continue - */ - if (by_list) { - if (pp->version == pp_version) - ppnext = pp->listq.tqe_next; - else - /* reset */ - ppnext = uobj->memq.tqh_first; - } else { - if (curoff < stop) - ppnext = uvm_pagelookup(uobj, - curoff); - } - continue; + /* + * no per-page ops: refresh ppnext and continue + */ + if (by_list) { + if (pp->version == pp_version) + ppnext = TAILQ_NEXT(pp, listq); + else + ppnext = TAILQ_FIRST(&uobj->memq); + } else { + if (curoff < stop) + ppnext = uvm_pagelookup(uobj, curoff); } - - /* need to do anything here? 
*/ + continue; } /* @@ -1120,18 +686,19 @@ ReTry: /* set up next page for outer loop */ if (by_list) { if (pp->version == pp_version) - ppnext = pp->listq.tqe_next; + ppnext = TAILQ_NEXT(pp, listq); else - /* reset */ - ppnext = uobj->memq.tqh_first; + ppnext = TAILQ_FIRST( + &uobj->memq); } else { if (curoff < stop) - ppnext = uvm_pagelookup(uobj, curoff); + ppnext = uvm_pagelookup(uobj, + curoff); } } /* - * verify the page didn't get moved while obj was + * verify the page wasn't moved while obj was * unlocked */ if (result == VM_PAGER_PEND && ptmp->uobject != uobj) @@ -1145,26 +712,32 @@ ReTry: */ if (result != VM_PAGER_PEND) { - if (ptmp->flags & PG_WANTED) + if (ptmp->flags & PG_WANTED) { /* still holding object lock */ wakeup(ptmp); - + } ptmp->flags &= ~(PG_WANTED|PG_BUSY); UVM_PAGE_OWN(ptmp, NULL); if (ptmp->flags & PG_RELEASED) { - - /* pgo_releasepg wants this */ uvm_unlock_pageq(); - if (!uvn_releasepg(ptmp, NULL)) + if (!uvn_releasepg(ptmp, NULL)) { + UVMHIST_LOG(maphist, + "released %p", + ptmp, 0,0,0); return (TRUE); - - uvm_lock_pageq(); /* relock */ - continue; /* next page */ - + } + uvm_lock_pageq(); + continue; } else { - ptmp->flags |= (PG_CLEAN|PG_CLEANCHK); - if ((flags & PGO_FREE) == 0) - pmap_clear_modify(ptmp); + if ((flags & PGO_WEAK) == 0 && + !(result == VM_PAGER_ERROR && + curproc == uvm.pagedaemon_proc)) { + ptmp->flags |= + (PG_CLEAN|PG_CLEANCHK); + if ((flags & PGO_FREE) == 0) { + pmap_clear_modify(ptmp); + } + } } } @@ -1174,11 +747,11 @@ ReTry: if (flags & PGO_DEACTIVATE) { if ((pp->pqflags & PQ_INACTIVE) == 0 && + (pp->flags & PG_BUSY) == 0 && pp->wire_count == 0) { - pmap_page_protect(ptmp, VM_PROT_NONE); + pmap_clear_reference(ptmp); uvm_pagedeactivate(ptmp); } - } else if (flags & PGO_FREE) { if (result == VM_PAGER_PEND) { if ((ptmp->flags & PG_BUSY) != 0) @@ -1187,10 +760,10 @@ ReTry: } else { if (result != VM_PAGER_OK) { printf("uvn_flush: obj=%p, " - "offset=0x%llx. error " - "during pageout.\n", + "offset=0x%llx. error %d\n", pp->uobject, - (long long)pp->offset); + (long long)pp->offset, + result); printf("uvn_flush: WARNING: " "changes to page may be " "lost!\n"); @@ -1200,31 +773,38 @@ ReTry: uvm_pagefree(ptmp); } } - } /* end of "lcv" for loop */ - } /* end of "pp" for loop */ - /* - * done with pagequeues: unlock - */ uvm_unlock_pageq(); - - /* - * now wait for all I/O if required. - */ + s = splbio(); + if ((flags & PGO_CLEANIT) && all && wasclean && + LIST_FIRST(&vp->v_dirtyblkhd) == NULL && + (vp->v_bioflag & VBIOONSYNCLIST)) { + vp->v_bioflag &= ~VBIOONSYNCLIST; + LIST_REMOVE(vp, v_synclist); + } + splx(s); if (need_iosync) { - UVMHIST_LOG(maphist," <<DOING IOSYNC>>",0,0,0,0); - while (uvn->u_nio != 0) { - uvn->u_flags |= UVM_VNODE_IOSYNC; - UVM_UNLOCK_AND_WAIT(&uvn->u_nio, &uvn->u_obj.vmobjlock, - FALSE, "uvn_flush",0); + + /* + * XXX this doesn't use the new two-flag scheme, + * but to use that, all i/o initiators will have to change. + */ + + s = splbio(); + while (vp->v_numoutput != 0) { + UVMHIST_LOG(ubchist, "waiting for vp %p num %d", + vp, vp->v_numoutput,0,0); + + vp->v_bioflag |= VBIOWAIT; + UVM_UNLOCK_AND_WAIT(&vp->v_numoutput, + &uvn->u_obj.vmobjlock, + FALSE, "uvn_flush",0); simple_lock(&uvn->u_obj.vmobjlock); } - if (uvn->u_flags & UVM_VNODE_IOSYNCWANTED) - wakeup(&uvn->u_flags); - uvn->u_flags &= ~(UVM_VNODE_IOSYNC|UVM_VNODE_IOSYNCWANTED); + splx(s); } /* return, with object locked! 
*/ @@ -1248,31 +828,18 @@ uvn_cluster(uobj, offset, loffset, hoffset) voff_t offset; voff_t *loffset, *hoffset; /* OUT */ { - struct uvm_vnode *uvn = (struct uvm_vnode *) uobj; - *loffset = offset; - - if (*loffset >= uvn->u_size) - panic("uvn_cluster: offset out of range"); + struct uvm_vnode *uvn = (struct uvm_vnode *)uobj; - /* - * XXX: old pager claims we could use VOP_BMAP to get maxcontig value. - */ - *hoffset = *loffset + MAXBSIZE; - if (*hoffset > round_page(uvn->u_size)) /* past end? */ - *hoffset = round_page(uvn->u_size); - - return; + *loffset = offset; + *hoffset = MIN(offset + MAXBSIZE, round_page(uvn->u_size)); } /* * uvn_put: flush page data to backing store. * - * => prefer map unlocked (not required) * => object must be locked! we will _unlock_ it before starting I/O. * => flags: PGO_SYNCIO -- use sync. I/O * => note: caller must set PG_CLEAN and pmap_clear_modify (if needed) - * => XXX: currently we use VOP_READ/VOP_WRITE which are only sync. - * [thus we never do async i/o! see iodone comment] */ static int @@ -1281,13 +848,11 @@ uvn_put(uobj, pps, npages, flags) struct vm_page **pps; int npages, flags; { - int retval; - - /* note: object locked */ - retval = uvn_io((struct uvm_vnode*)uobj, pps, npages, flags, UIO_WRITE); - /* note: object unlocked */ + struct vnode *vp = (struct vnode *)uobj; + int error; - return(retval); + error = VOP_PUTPAGES(vp, pps, npages, flags, NULL); + return uvm_errno2vmerror(error); } @@ -1308,551 +873,121 @@ uvn_get(uobj, offset, pps, npagesp, centeridx, access_type, advice, flags) voff_t offset; struct vm_page **pps; /* IN/OUT */ int *npagesp; /* IN (OUT if PGO_LOCKED) */ - int centeridx, advice, flags; + int centeridx; vm_prot_t access_type; + int advice, flags; { - voff_t current_offset; - struct vm_page *ptmp; - int lcv, result, gotpages; - boolean_t done; - UVMHIST_FUNC("uvn_get"); UVMHIST_CALLED(maphist); - UVMHIST_LOG(maphist, "flags=%d", flags,0,0,0); - - /* - * step 1: handled the case where fault data structures are locked. - */ - - if (flags & PGO_LOCKED) { - - /* - * gotpages is the current number of pages we've gotten (which - * we pass back up to caller via *npagesp. - */ - - gotpages = 0; - - /* - * step 1a: get pages that are already resident. only do this - * if the data structures are locked (i.e. the first time - * through). - */ - - done = TRUE; /* be optimistic */ - - for (lcv = 0, current_offset = offset ; lcv < *npagesp ; - lcv++, current_offset += PAGE_SIZE) { - - /* do we care about this page? if not, skip it */ - if (pps[lcv] == PGO_DONTCARE) - continue; - - /* lookup page */ - ptmp = uvm_pagelookup(uobj, current_offset); - - /* to be useful must get a non-busy, non-released pg */ - if (ptmp == NULL || - (ptmp->flags & (PG_BUSY|PG_RELEASED)) != 0) { - if (lcv == centeridx || (flags & PGO_ALLPAGES) - != 0) - done = FALSE; /* need to do a wait or I/O! */ - continue; - } - - /* - * useful page: busy/lock it and plug it in our - * result array - */ - ptmp->flags |= PG_BUSY; /* loan up to caller */ - UVM_PAGE_OWN(ptmp, "uvn_get1"); - pps[lcv] = ptmp; - gotpages++; - - } /* "for" lcv loop */ - - /* - * XXX: given the "advice", should we consider async read-ahead? - * XXX: fault current does deactive of pages behind us. is - * this good (other callers might now). - */ - /* - * XXX: read-ahead currently handled by buffer cache (bread) - * level. - * XXX: no async i/o available. - * XXX: so we don't do anything now. 
- */ - - /* - * step 1c: now we've either done everything needed or we to - * unlock and do some waiting or I/O. - */ - - *npagesp = gotpages; /* let caller know */ - if (done) - return(VM_PAGER_OK); /* bingo! */ - else - /* EEK! Need to unlock and I/O */ - return(VM_PAGER_UNLOCK); - } - - /* - * step 2: get non-resident or busy pages. - * object is locked. data structures are unlocked. - * - * XXX: because we can't do async I/O at this level we get things - * page at a time (otherwise we'd chunk). the VOP_READ() will do - * async-read-ahead for us at a lower level. - */ - - for (lcv = 0, current_offset = offset ; - lcv < *npagesp ; lcv++, current_offset += PAGE_SIZE) { - - /* skip over pages we've already gotten or don't want */ - /* skip over pages we don't _have_ to get */ - if (pps[lcv] != NULL || (lcv != centeridx && - (flags & PGO_ALLPAGES) == 0)) - continue; - - /* - * we have yet to locate the current page (pps[lcv]). we first - * look for a page that is already at the current offset. if - * we fine a page, we check to see if it is busy or released. - * if that is the case, then we sleep on the page until it is - * no longer busy or released and repeat the lookup. if the - * page we found is neither busy nor released, then we busy it - * (so we own it) and plug it into pps[lcv]. this breaks the - * following while loop and indicates we are ready to move on - * to the next page in the "lcv" loop above. - * - * if we exit the while loop with pps[lcv] still set to NULL, - * then it means that we allocated a new busy/fake/clean page - * ptmp in the object and we need to do I/O to fill in the data. - */ - - while (pps[lcv] == NULL) { /* top of "pps" while loop */ - - /* look for a current page */ - ptmp = uvm_pagelookup(uobj, current_offset); - - /* nope? allocate one now (if we can) */ - if (ptmp == NULL) { - - ptmp = uvm_pagealloc(uobj, current_offset, - NULL, 0); - - /* out of RAM? */ - if (ptmp == NULL) { - simple_unlock(&uobj->vmobjlock); - uvm_wait("uvn_getpage"); - simple_lock(&uobj->vmobjlock); - - /* goto top of pps while loop */ - continue; - } - - /* - * got new page ready for I/O. break pps - * while loop. pps[lcv] is still NULL. - */ - break; - } - - /* page is there, see if we need to wait on it */ - if ((ptmp->flags & (PG_BUSY|PG_RELEASED)) != 0) { - ptmp->flags |= PG_WANTED; - UVM_UNLOCK_AND_WAIT(ptmp, - &uobj->vmobjlock, FALSE, "uvn_get",0); - simple_lock(&uobj->vmobjlock); - continue; /* goto top of pps while loop */ - } - - /* - * if we get here then the page has become resident - * and unbusy between steps 1 and 2. we busy it - * now (so we own it) and set pps[lcv] (so that we - * exit the while loop). - */ - ptmp->flags |= PG_BUSY; - UVM_PAGE_OWN(ptmp, "uvn_get2"); - pps[lcv] = ptmp; - } - - /* - * if we own the a valid page at the correct offset, pps[lcv] - * will point to it. nothing more to do except go to the - * next page. - */ - - if (pps[lcv]) - continue; /* next lcv */ - - /* - * we have a "fake/busy/clean" page that we just allocated. do - * I/O to fill it with valid data. note that object must be - * locked going into uvn_io, but will be unlocked afterwards. - */ - - result = uvn_io((struct uvm_vnode *) uobj, &ptmp, 1, - PGO_SYNCIO, UIO_READ); - - /* - * I/O done. object is unlocked (by uvn_io). because we used - * syncio the result can not be PEND or AGAIN. we must relock - * and check for errors. - */ - - /* lock object. check for errors. 
*/ - simple_lock(&uobj->vmobjlock); - if (result != VM_PAGER_OK) { - if (ptmp->flags & PG_WANTED) - /* object lock still held */ - wakeup(ptmp); - - ptmp->flags &= ~(PG_WANTED|PG_BUSY); - UVM_PAGE_OWN(ptmp, NULL); - uvm_lock_pageq(); - uvm_pagefree(ptmp); - uvm_unlock_pageq(); - simple_unlock(&uobj->vmobjlock); - return(result); - } - - /* - * we got the page! clear the fake flag (indicates valid - * data now in page) and plug into our result array. note - * that page is still busy. - * - * it is the callers job to: - * => check if the page is released - * => unbusy the page - * => activate the page - */ - - ptmp->flags &= ~PG_FAKE; /* data is valid ... */ - pmap_clear_modify(ptmp); /* ... and clean */ - pps[lcv] = ptmp; - - } /* lcv loop */ - - /* - * finally, unlock object and return. - */ - - simple_unlock(&uobj->vmobjlock); - return (VM_PAGER_OK); + struct vnode *vp = (struct vnode *)uobj; + int error; + UVMHIST_FUNC("uvn_get"); UVMHIST_CALLED(ubchist); + + UVMHIST_LOG(ubchist, "vp %p off 0x%x", vp, (int)offset, 0,0); + error = VOP_GETPAGES(vp, offset, pps, npagesp, centeridx, + access_type, advice, flags); + return uvm_errno2vmerror(error); } + /* - * uvn_io: do I/O to a vnode - * - * => prefer map unlocked (not required) - * => object must be locked! we will _unlock_ it before starting I/O. - * => flags: PGO_SYNCIO -- use sync. I/O - * => XXX: currently we use VOP_READ/VOP_WRITE which are only sync. - * [thus we never do async i/o! see iodone comment] + * uvn_findpages: + * return the page for the uobj and offset requested, allocating if needed. + * => uobj must be locked. + * => returned page will be BUSY. */ -static int -uvn_io(uvn, pps, npages, flags, rw) - struct uvm_vnode *uvn; - vm_page_t *pps; - int npages, flags, rw; +void +uvn_findpages(uobj, offset, npagesp, pps, flags) + struct uvm_object *uobj; + voff_t offset; + int *npagesp; + struct vm_page **pps; + int flags; { - struct vnode *vn; - struct uio uio; - struct iovec iov; - vaddr_t kva; - off_t file_offset; - int waitf, result, mapinflags; - size_t got, wanted; - UVMHIST_FUNC("uvn_io"); UVMHIST_CALLED(maphist); - - UVMHIST_LOG(maphist, "rw=%d", rw,0,0,0); - - /* - * init values - */ - - waitf = (flags & PGO_SYNCIO) ? M_WAITOK : M_NOWAIT; - vn = (struct vnode *) uvn; - file_offset = pps[0]->offset; - - /* - * check for sync'ing I/O. - */ - - while (uvn->u_flags & UVM_VNODE_IOSYNC) { - if (waitf == M_NOWAIT) { - simple_unlock(&uvn->u_obj.vmobjlock); - UVMHIST_LOG(maphist,"<- try again (iosync)",0,0,0,0); - return(VM_PAGER_AGAIN); - } - uvn->u_flags |= UVM_VNODE_IOSYNCWANTED; - UVM_UNLOCK_AND_WAIT(&uvn->u_flags, &uvn->u_obj.vmobjlock, - FALSE, "uvn_iosync",0); - simple_lock(&uvn->u_obj.vmobjlock); - } - - /* - * check size - */ - - if (file_offset >= uvn->u_size) { - simple_unlock(&uvn->u_obj.vmobjlock); - UVMHIST_LOG(maphist,"<- BAD (size check)",0,0,0,0); - return(VM_PAGER_BAD); - } - - /* - * first try and map the pages in (without waiting) - */ - - mapinflags = (rw == UIO_READ) ? - UVMPAGER_MAPIN_READ : UVMPAGER_MAPIN_WRITE; - - kva = uvm_pagermapin(pps, npages, mapinflags); - if (kva == 0 && waitf == M_NOWAIT) { - simple_unlock(&uvn->u_obj.vmobjlock); - UVMHIST_LOG(maphist,"<- mapin failed (try again)",0,0,0,0); - return(VM_PAGER_AGAIN); - } - - /* - * ok, now bump u_nio up. at this point we are done with uvn - * and can unlock it. if we still don't have a kva, try again - * (this time with sleep ok). - */ - - uvn->u_nio++; /* we have an I/O in progress! 
*/ - simple_unlock(&uvn->u_obj.vmobjlock); - /* NOTE: object now unlocked */ - if (kva == 0) - kva = uvm_pagermapin(pps, npages, - mapinflags | UVMPAGER_MAPIN_WAITOK); - - /* - * ok, mapped in. our pages are PG_BUSY so they are not going to - * get touched (so we can look at "offset" without having to lock - * the object). set up for I/O. - */ - - /* - * fill out uio/iov - */ - - iov.iov_base = (caddr_t) kva; - wanted = npages << PAGE_SHIFT; - if (file_offset + wanted > uvn->u_size) - wanted = uvn->u_size - file_offset; /* XXX: needed? */ - iov.iov_len = wanted; - uio.uio_iov = &iov; - uio.uio_iovcnt = 1; - uio.uio_offset = file_offset; - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_rw = rw; - uio.uio_resid = wanted; - uio.uio_procp = curproc; - - /* - * do the I/O! (XXX: curproc?) - */ - - UVMHIST_LOG(maphist, "calling VOP",0,0,0,0); - - /* - * This process may already have this vnode locked, if we faulted in - * copyin() or copyout() on a region backed by this vnode - * while doing I/O to the vnode. If this is the case, don't - * panic.. instead, return the error to the user. - * - * XXX this is a stopgap to prevent a panic. - * Ideally, this kind of operation *should* work. - */ - result = 0; - if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0) - result = vn_lock(vn, LK_EXCLUSIVE | LK_RETRY | LK_RECURSEFAIL, curproc); - - if (result == 0) { - /* NOTE: vnode now locked! */ - - if (rw == UIO_READ) - result = VOP_READ(vn, &uio, 0, curproc->p_ucred); - else - result = VOP_WRITE(vn, &uio, 0, curproc->p_ucred); + int i, rv, npages; - if ((uvn->u_flags & UVM_VNODE_VNISLOCKED) == 0) - VOP_UNLOCK(vn, 0, curproc); + rv = 0; + npages = *npagesp; + for (i = 0; i < npages; i++, offset += PAGE_SIZE) { + rv += uvn_findpage(uobj, offset, &pps[i], flags); } - - /* NOTE: vnode now unlocked (unless vnislocked) */ - - UVMHIST_LOG(maphist, "done calling VOP",0,0,0,0); - - /* - * result == unix style errno (0 == OK!) - * - * zero out rest of buffer (if needed) - */ - - if (result == 0) { - got = wanted - uio.uio_resid; - - if (wanted && got == 0) { - result = EIO; /* XXX: error? */ - } else if (got < PAGE_SIZE * npages && rw == UIO_READ) { - memset((void *) (kva + got), 0, - (npages << PAGE_SHIFT) - got); - } - } - - /* - * now remove pager mapping - */ - uvm_pagermapout(kva, npages); - - /* - * now clean up the object (i.e. drop I/O count) - */ - - simple_lock(&uvn->u_obj.vmobjlock); - /* NOTE: object now locked! */ - - uvn->u_nio--; /* I/O DONE! */ - if ((uvn->u_flags & UVM_VNODE_IOSYNC) != 0 && uvn->u_nio == 0) { - wakeup(&uvn->u_nio); - } - simple_unlock(&uvn->u_obj.vmobjlock); - /* NOTE: object now unlocked! */ - - /* - * done! - */ - - UVMHIST_LOG(maphist, "<- done (result %d)", result,0,0,0); - if (result == 0) - return(VM_PAGER_OK); - else - return(VM_PAGER_ERROR); + *npagesp = rv; } -/* - * uvm_vnp_uncache: disable "persisting" in a vnode... when last reference - * is gone we will kill the object (flushing dirty pages back to the vnode - * if needed). - * - * => returns TRUE if there was no uvm_object attached or if there was - * one and we killed it [i.e. if there is no active uvn] - * => called with the vnode VOP_LOCK'd [we will unlock it for I/O, if - * needed] - * - * => XXX: given that we now kill uvn's when a vnode is recycled (without - * having to hold a reference on the vnode) and given a working - * uvm_vnp_sync(), how does that effect the need for this function? - * [XXXCDC: seems like it can die?] - * - * => XXX: this function should DIE once we merge the VM and buffer - * cache. 
- * - * research shows that this is called in the following places: - * ext2fs_truncate, ffs_truncate, detrunc[msdosfs]: called when vnode - * changes sizes - * ext2fs_write, WRITE [ufs_readwrite], msdosfs_write: called when we - * are written to - * ex2fs_chmod, ufs_chmod: called if VTEXT vnode and the sticky bit - * is off - * ffs_realloccg: when we can't extend the current block and have - * to allocate a new one we call this [XXX: why?] - * nfsrv_rename, rename_files: called when the target filename is there - * and we want to remove it - * nfsrv_remove, sys_unlink: called on file we are removing - * nfsrv_access: if VTEXT and we want WRITE access and we don't uncache - * then return "text busy" - * nfs_open: seems to uncache any file opened with nfs - * vn_writechk: if VTEXT vnode and can't uncache return "text busy" - */ - -boolean_t -uvm_vnp_uncache(vp) - struct vnode *vp; +static int +uvn_findpage(uobj, offset, pgp, flags) + struct uvm_object *uobj; + voff_t offset; + struct vm_page **pgp; + int flags; { - struct uvm_vnode *uvn = &vp->v_uvm; - - /* - * lock uvn part of the vnode and check to see if we need to do anything - */ + struct vm_page *pg; + UVMHIST_FUNC("uvn_findpage"); UVMHIST_CALLED(ubchist); + UVMHIST_LOG(ubchist, "vp %p off 0x%lx", uobj, offset,0,0); - simple_lock(&uvn->u_obj.vmobjlock); - if ((uvn->u_flags & UVM_VNODE_VALID) == 0 || - (uvn->u_flags & UVM_VNODE_BLOCKED) != 0) { - simple_unlock(&uvn->u_obj.vmobjlock); - return(TRUE); + if (*pgp != NULL) { + UVMHIST_LOG(ubchist, "dontcare", 0,0,0,0); + return 0; } + for (;;) { + /* look for an existing page */ + pg = uvm_pagelookup(uobj, offset); + + /* nope? allocate one now */ + if (pg == NULL) { + if (flags & UFP_NOALLOC) { + UVMHIST_LOG(ubchist, "noalloc", 0,0,0,0); + return 0; + } + pg = uvm_pagealloc(uobj, offset, NULL, 0); + if (pg == NULL) { + if (flags & UFP_NOWAIT) { + UVMHIST_LOG(ubchist, "nowait",0,0,0,0); + return 0; + } + simple_unlock(&uobj->vmobjlock); + uvm_wait("uvn_fp1"); + simple_lock(&uobj->vmobjlock); + continue; + } + if (UVM_OBJ_IS_VTEXT(uobj)) { + uvmexp.vtextpages++; + } else { + uvmexp.vnodepages++; + } + UVMHIST_LOG(ubchist, "alloced",0,0,0,0); + break; + } else if (flags & UFP_NOCACHE) { + UVMHIST_LOG(ubchist, "nocache",0,0,0,0); + return 0; + } - /* - * we have a valid, non-blocked uvn. clear persist flag. - * if uvn is currently active we can return now. - */ - - uvn->u_flags &= ~UVM_VNODE_CANPERSIST; - if (uvn->u_obj.uo_refs) { - simple_unlock(&uvn->u_obj.vmobjlock); - return(FALSE); - } - - /* - * uvn is currently persisting! we have to gain a reference to - * it so that we can call uvn_detach to kill the uvn. - */ - - VREF(vp); /* seems ok, even with VOP_LOCK */ - uvn->u_obj.uo_refs++; /* value is now 1 */ - simple_unlock(&uvn->u_obj.vmobjlock); - - -#ifdef DEBUG - /* - * carry over sanity check from old vnode pager: the vnode should - * be VOP_LOCK'd, and we confirm it here. 
- */ - if (!VOP_ISLOCKED(vp)) { - boolean_t is_ok_anyway = FALSE; -#if defined(NFSCLIENT) - extern int (**nfsv2_vnodeop_p) __P((void *)); - extern int (**spec_nfsv2nodeop_p) __P((void *)); - extern int (**fifo_nfsv2nodeop_p) __P((void *)); - - /* vnode is NOT VOP_LOCKed: some vnode types _never_ lock */ - if (vp->v_op == nfsv2_vnodeop_p || - vp->v_op == spec_nfsv2nodeop_p) { - is_ok_anyway = TRUE; + /* page is there, see if we need to wait on it */ + if ((pg->flags & (PG_BUSY|PG_RELEASED)) != 0) { + if (flags & UFP_NOWAIT) { + UVMHIST_LOG(ubchist, "nowait",0,0,0,0); + return 0; + } + pg->flags |= PG_WANTED; + UVM_UNLOCK_AND_WAIT(pg, &uobj->vmobjlock, 0, + "uvn_fp2", 0); + simple_lock(&uobj->vmobjlock); + continue; } - if (vp->v_op == fifo_nfsv2nodeop_p) { - is_ok_anyway = TRUE; + + /* skip PG_RDONLY pages if requested */ + if ((flags & UFP_NORDONLY) && (pg->flags & PG_RDONLY)) { + UVMHIST_LOG(ubchist, "nordonly",0,0,0,0); + return 0; } -#endif /* defined(NFSSERVER) || defined(NFSCLIENT) */ - if (!is_ok_anyway) - panic("uvm_vnp_uncache: vnode not locked!"); - } -#endif /* DEBUG */ - /* - * now drop our reference to the vnode. if we have the sole - * reference to the vnode then this will cause it to die [as we - * just cleared the persist flag]. we have to unlock the vnode - * while we are doing this as it may trigger I/O. - * - * XXX: it might be possible for uvn to get reclaimed while we are - * unlocked causing us to return TRUE when we should not. we ignore - * this as a false-positive return value doesn't hurt us. - */ - VOP_UNLOCK(vp, 0, curproc); - uvn_detach(&uvn->u_obj); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curproc); - - /* - * and return... - */ - - return(TRUE); + /* mark the page BUSY and we're done. */ + pg->flags |= PG_BUSY; + UVM_PAGE_OWN(pg, "uvn_findpage"); + UVMHIST_LOG(ubchist, "found",0,0,0,0); + break; + } + *pgp = pg; + return 1; } /* @@ -1879,150 +1014,49 @@ uvm_vnp_setsize(vp, newsize) voff_t newsize; { struct uvm_vnode *uvn = &vp->v_uvm; + voff_t pgend = round_page(newsize); + UVMHIST_FUNC("uvm_vnp_setsize"); UVMHIST_CALLED(ubchist); - /* - * lock uvn and check for valid object, and if valid: do it! - */ simple_lock(&uvn->u_obj.vmobjlock); - if (uvn->u_flags & UVM_VNODE_VALID) { - - /* - * now check if the size has changed: if we shrink we had better - * toss some pages... - */ - if (uvn->u_size > newsize) { - (void)uvn_flush(&uvn->u_obj, newsize, - uvn->u_size, PGO_FREE); - } - uvn->u_size = newsize; - } - simple_unlock(&uvn->u_obj.vmobjlock); + UVMHIST_LOG(ubchist, "old 0x%x new 0x%x", uvn->u_size, newsize, 0,0); /* - * done + * now check if the size has changed: if we shrink we had better + * toss some pages... */ - return; + + if (uvn->u_size > pgend && uvn->u_size != VSIZENOTSET) { + (void) uvn_flush(&uvn->u_obj, pgend, 0, PGO_FREE); + } + uvn->u_size = newsize; + simple_unlock(&uvn->u_obj.vmobjlock); } /* - * uvm_vnp_sync: flush all dirty VM pages back to their backing vnodes. - * - * => called from sys_sync with no VM structures locked - * => only one process can do a sync at a time (because the uvn - * structure only has one queue for sync'ing). we ensure this - * by holding the uvn_sync_lock while the sync is in progress. - * other processes attempting a sync will sleep on this lock - * until we are done. + * uvm_vnp_zerorange: set a range of bytes in a file to zero. 
*/ void -uvm_vnp_sync(mp) - struct mount *mp; -{ - struct uvm_vnode *uvn; +uvm_vnp_zerorange(vp, off, len) struct vnode *vp; - boolean_t got_lock; - - /* - * step 1: ensure we are only ones using the uvn_sync_q by locking - * our lock... - */ - lockmgr(&uvn_sync_lock, LK_EXCLUSIVE, NULL, curproc); - - /* - * step 2: build up a simpleq of uvns of interest based on the - * write list. we gain a reference to uvns of interest. must - * be careful about locking uvn's since we will be holding uvn_wl_lock - * in the body of the loop. - */ - SIMPLEQ_INIT(&uvn_sync_q); - simple_lock(&uvn_wl_lock); - for (uvn = uvn_wlist.lh_first ; uvn != NULL ; - uvn = uvn->u_wlist.le_next) { - - vp = (struct vnode *) uvn; - if (mp && vp->v_mount != mp) - continue; - - /* attempt to gain reference */ - while ((got_lock = simple_lock_try(&uvn->u_obj.vmobjlock)) == - FALSE && - (uvn->u_flags & UVM_VNODE_BLOCKED) == 0) - /* spin */ ; - - /* - * we will exit the loop if either if the following are true: - * - we got the lock [always true if NCPU == 1] - * - we failed to get the lock but noticed the vnode was - * "blocked" -- in this case the vnode must be a dying - * vnode, and since dying vnodes are in the process of - * being flushed out, we can safely skip this one - * - * we want to skip over the vnode if we did not get the lock, - * or if the vnode is already dying (due to the above logic). - * - * note that uvn must already be valid because we found it on - * the wlist (this also means it can't be ALOCK'd). - */ - if (!got_lock || (uvn->u_flags & UVM_VNODE_BLOCKED) != 0) { - if (got_lock) - simple_unlock(&uvn->u_obj.vmobjlock); - continue; /* skip it */ - } - - /* - * gain reference. watch out for persisting uvns (need to - * regain vnode REF). - */ - if (uvn->u_obj.uo_refs == 0) - VREF(vp); - uvn->u_obj.uo_refs++; - simple_unlock(&uvn->u_obj.vmobjlock); - - /* - * got it! - */ - SIMPLEQ_INSERT_HEAD(&uvn_sync_q, uvn, u_syncq); - } - simple_unlock(&uvn_wl_lock); + off_t off; + size_t len; +{ + void *win; - /* - * step 3: we now have a list of uvn's that may need cleaning. - * we are holding the uvn_sync_lock, but have dropped the uvn_wl_lock - * (so we can now safely lock uvn's again). - */ + /* + * XXXUBC invent kzero() and use it + */ - for (uvn = uvn_sync_q.sqh_first ; uvn ; uvn = uvn->u_syncq.sqe_next) { - simple_lock(&uvn->u_obj.vmobjlock); -#ifdef DEBUG - if (uvn->u_flags & UVM_VNODE_DYING) { - printf("uvm_vnp_sync: dying vnode on sync list\n"); - } -#endif - uvn_flush(&uvn->u_obj, 0, 0, - PGO_CLEANIT|PGO_ALLPAGES|PGO_DOACTCLUST); + while (len) { + vsize_t bytelen = len; - /* - * if we have the only reference and we just cleaned the uvn, - * then we can pull it out of the UVM_VNODE_WRITEABLE state - * thus allowing us to avoid thinking about flushing it again - * on later sync ops. - */ - if (uvn->u_obj.uo_refs == 1 && - (uvn->u_flags & UVM_VNODE_WRITEABLE)) { - LIST_REMOVE(uvn, u_wlist); - uvn->u_flags &= ~UVM_VNODE_WRITEABLE; - } - - simple_unlock(&uvn->u_obj.vmobjlock); + win = ubc_alloc(&vp->v_uvm.u_obj, off, &bytelen, UBC_WRITE); + memset(win, 0, bytelen); + ubc_release(win, 0); - /* now drop our reference to the uvn */ - uvn_detach(&uvn->u_obj); - } - - /* - * done! 
release sync lock - */ - lockmgr(&uvn_sync_lock, LK_RELEASE, (void *)0, curproc); + off += bytelen; + len -= bytelen; + } } diff --git a/sys/uvm/uvm_vnode.h b/sys/uvm/uvm_vnode.h index 29efe4d2ac4..ce853189207 100644 --- a/sys/uvm/uvm_vnode.h +++ b/sys/uvm/uvm_vnode.h @@ -1,4 +1,4 @@ -/* $OpenBSD: uvm_vnode.h,v 1.8 2001/08/06 14:03:05 art Exp $ */ +/* $OpenBSD: uvm_vnode.h,v 1.9 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: uvm_vnode.h,v 1.9 2000/03/26 20:54:48 kleink Exp $ */ /* @@ -55,56 +55,6 @@ struct uvm_vnode { int u_flags; /* flags */ int u_nio; /* number of running I/O requests */ voff_t u_size; /* size of object */ - - /* the following entry is locked by uvn_wl_lock */ - LIST_ENTRY(uvm_vnode) u_wlist; /* list of writeable vnode objects */ - - /* the following entry is locked by uvn_sync_lock */ - SIMPLEQ_ENTRY(uvm_vnode) u_syncq; /* vnode objects due for a "sync" */ }; -/* - * u_flags values - */ -#define UVM_VNODE_VALID 0x001 /* we are attached to the vnode */ -#define UVM_VNODE_CANPERSIST 0x002 /* we can persist after ref == 0 */ -#define UVM_VNODE_ALOCK 0x004 /* uvn_attach is locked out */ -#define UVM_VNODE_DYING 0x008 /* final detach/terminate in - progress */ -#define UVM_VNODE_RELKILL 0x010 /* uvn should be killed by releasepg - when final i/o is done */ -#define UVM_VNODE_WANTED 0x020 /* someone is waiting for alock, - dying, or relkill to clear */ -#define UVM_VNODE_VNISLOCKED 0x040 /* underlying vnode struct is locked - (valid when DYING is true) */ -#define UVM_VNODE_IOSYNC 0x080 /* I/O sync in progress ... setter - sleeps on &uvn->u_nio */ -#define UVM_VNODE_IOSYNCWANTED 0x100 /* a process is waiting for the - i/o sync to clear so it can do - i/o */ -#define UVM_VNODE_WRITEABLE 0x200 /* uvn has pages that are writeable */ - -/* - * UVM_VNODE_BLOCKED: any condition that should new processes from - * touching the vnode [set WANTED and sleep to wait for it to clear] - */ -#define UVM_VNODE_BLOCKED (UVM_VNODE_ALOCK|UVM_VNODE_DYING|UVM_VNODE_RELKILL) - -#ifdef _KERNEL - -/* - * prototypes - */ - -#if 0 -/* - * moved uvn_attach to uvm_extern.h because uvm_vnode.h is needed to - * include sys/vnode.h, and files that include sys/vnode.h don't know - * what a vm_prot_t is. - */ -struct uvm_object *uvn_attach __P((void *, vm_prot_t)); -#endif - -#endif /* _KERNEL */ - #endif /* _UVM_UVM_VNODE_H_ */ |
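Hypothetical illustration, not part of the commit: the uvn_findpages()/uvn_findpage() pair added in the uvm_vnode.c hunks above replaces the old page-at-a-time lookup that uvn_get() used to perform inline, and it is the building block the new VOP_GETPAGES/VOP_PUTPAGES based pager code works with. The sketch below shows how a caller is expected to use that interface; the function name, the fixed page count and the choice of flags == 0 are invented for illustration only, and the locking and PG_BUSY/PG_WANTED handling simply mirror the conventions visible in uvn_findpage() and uvn_flush() in this diff.

/*
 * Hypothetical example, not from the commit: look up (or allocate) a
 * small run of pages in a vnode's uvm_object with the new
 * uvn_findpages() interface, then unbusy them again.  Kernel context
 * is assumed, along with the usual UVM headers and a visible
 * prototype for uvn_findpages().
 */
#include <sys/param.h>
#include <sys/systm.h>

#include <uvm/uvm.h>

#define EXAMPLE_NPAGES	4		/* invented for the example */

void
example_findpages(uobj, offset)
	struct uvm_object *uobj;
	voff_t offset;
{
	struct vm_page *pgs[EXAMPLE_NPAGES];
	int i, npages = EXAMPLE_NPAGES;

	/* NULL slots ask uvn_findpages() to fill them in */
	memset(pgs, 0, sizeof(pgs));

	simple_lock(&uobj->vmobjlock);

	/*
	 * flags == 0: allocate missing pages and sleep on busy ones.
	 * on return npages holds the number of pages found and every
	 * returned page is PG_BUSY (owned by us).
	 */
	uvn_findpages(uobj, offset, &npages, pgs, 0);

	/* ... a real caller would read or modify the pages here ... */

	/* unbusy the pages, waking anyone who slept on them */
	for (i = 0; i < EXAMPLE_NPAGES; i++) {
		if (pgs[i] == NULL)
			continue;
		if (pgs[i]->flags & PG_WANTED)
			wakeup(pgs[i]);
		pgs[i]->flags &= ~(PG_BUSY|PG_WANTED);
		UVM_PAGE_OWN(pgs[i], NULL);
	}
	simple_unlock(&uobj->vmobjlock);
}

Callers that must not block would pass UFP_NOWAIT (possibly together with UFP_NOALLOC or UFP_NOCACHE) instead of 0; with those flags some array slots can legitimately come back NULL, which is why the unbusy loop above checks for that before touching a page.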