author     Artur Grabowski <art@cvs.openbsd.org>    2001-11-27 05:27:13 +0000
committer  Artur Grabowski <art@cvs.openbsd.org>    2001-11-27 05:27:13 +0000
commit     8a1845e49f56720cbfccd4c7f5f80ba5b980fdf4 (patch)
tree       d4a522dc41cdc79ba48fe761e94663b795da8cc0 /sys/nfs/nfs_bio.c
parent     0d68e9b5af14f4bfa04d22dbebab5972ac647b26 (diff)
Merge in the unified buffer cache code as found in NetBSD 2001/03/10. The
code was written mostly by Chuck Silvers <chuq@chuq.com>/<chs@netbsd.org>.
It has been tested for the past few weeks by many developers and should be in a
pretty stable state, but it will still require optimization and additional cleanups.
Diffstat (limited to 'sys/nfs/nfs_bio.c')
-rw-r--r--   sys/nfs/nfs_bio.c   928
1 file changed, 672 insertions, 256 deletions
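
The core of the change to nfs_bioread() is that the old per-block buffer cache path (read-ahead, incore()/nfs_getcacheblk() lookups, b_validoff/b_validend bookkeeping) is replaced, for regular files, by a loop that maps the file's pages through the unified buffer cache and copies straight out of that mapping. Below is a trimmed-down sketch of that loop, using only calls that appear in the diff (ubc_alloc(), uiomove(), ubc_release()); np, vp and uio come from the surrounding function, and the non-VREG cases are omitted.

	/*
	 * Sketch of the UBC read path for the VREG case in nfs_bioread():
	 * map a window of the vnode's pages, copy into the caller's uio,
	 * release the window, and repeat until the request or the file is
	 * exhausted.  Simplified from the diff, not the full function.
	 */
	while (uio->uio_resid > 0) {
		void *win;
		vsize_t bytelen = MIN(np->n_size - uio->uio_offset,
		    uio->uio_resid);

		if (bytelen == 0)
			break;			/* reached end of file */

		/* map the pages backing [uio_offset, uio_offset + bytelen) */
		win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset,
		    &bytelen, UBC_READ);

		/* uiomove() copies the data and advances uio_offset */
		error = uiomove(win, bytelen, uio);
		ubc_release(win, 0);
		if (error)
			break;
	}
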
diff --git a/sys/nfs/nfs_bio.c b/sys/nfs/nfs_bio.c index 1f33bc2eab7..42b25763a88 100644 --- a/sys/nfs/nfs_bio.c +++ b/sys/nfs/nfs_bio.c @@ -1,4 +1,4 @@ -/* $OpenBSD: nfs_bio.c,v 1.24 2001/11/15 23:15:15 art Exp $ */ +/* $OpenBSD: nfs_bio.c,v 1.25 2001/11/27 05:27:12 art Exp $ */ /* $NetBSD: nfs_bio.c,v 1.25.4.2 1996/07/08 20:47:04 jtc Exp $ */ /* @@ -50,8 +50,9 @@ #include <sys/mount.h> #include <sys/kernel.h> #include <sys/namei.h> +#include <sys/pool.h> -#include <uvm/uvm_extern.h> +#include <uvm/uvm.h> #include <nfs/rpcv2.h> #include <nfs/nfsproto.h> @@ -70,20 +71,19 @@ struct nfsstats nfsstats; */ int nfs_bioread(vp, uio, ioflag, cred) - register struct vnode *vp; - register struct uio *uio; + struct vnode *vp; + struct uio *uio; int ioflag; struct ucred *cred; { - register struct nfsnode *np = VTONFS(vp); - register int biosize, diff; - struct buf *bp = NULL, *rabp; + struct nfsnode *np = VTONFS(vp); + int biosize; + struct buf *bp = NULL; struct vattr vattr; struct proc *p; struct nfsmount *nmp = VFSTONFS(vp->v_mount); - daddr_t lbn, bn, rabn; caddr_t baddr; - int got_buf = 0, nra, error = 0, n = 0, on = 0, not_readin; + int got_buf = 0, error = 0, n = 0, on = 0; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ) @@ -153,87 +153,25 @@ nfs_bioread(vp, uio, ioflag, cred) switch (vp->v_type) { case VREG: nfsstats.biocache_reads++; - lbn = uio->uio_offset / biosize; - on = uio->uio_offset & (biosize - 1); - bn = lbn * (biosize / DEV_BSIZE); - not_readin = 1; - - /* - * Start the read ahead(s), as required. - */ - if (nfs_numasync > 0 && nmp->nm_readahead > 0) { - for (nra = 0; nra < nmp->nm_readahead && - (lbn + 1 + nra) * biosize < np->n_size; nra++) { - rabn = (lbn + 1 + nra) * (biosize / DEV_BSIZE); - if (!incore(vp, rabn)) { - rabp = nfs_getcacheblk(vp, rabn, biosize, p); - if (!rabp) - return (EINTR); - if ((rabp->b_flags & (B_DELWRI | B_DONE)) == 0) { - rabp->b_flags |= (B_READ | B_ASYNC); - if (nfs_asyncio(rabp)) { - rabp->b_flags |= B_INVAL; - brelse(rabp); - } - } else - brelse(rabp); - } - } - } + error = 0; + while (uio->uio_resid > 0) { + void *win; + vsize_t bytelen = MIN(np->n_size - uio->uio_offset, + uio->uio_resid); - /* - * If the block is in the cache and has the required data - * in a valid region, just copy it out. - * Otherwise, get the block and write back/read in, - * as required. - */ - if ((bp = incore(vp, bn)) && - (bp->b_flags & (B_BUSY | B_WRITEINPROG)) == - (B_BUSY | B_WRITEINPROG)) - got_buf = 0; - else { -again: - bp = nfs_getcacheblk(vp, bn, biosize, p); - if (!bp) - return (EINTR); - got_buf = 1; - if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) { - bp->b_flags |= B_READ; - not_readin = 0; - error = nfs_doio(bp, p); - if (error) { - brelse(bp); - return (error); - } - } - } - n = min((unsigned)(biosize - on), uio->uio_resid); - diff = np->n_size - uio->uio_offset; - if (diff < n) - n = diff; - if (not_readin && n > 0) { - if (on < bp->b_validoff || (on + n) > bp->b_validend) { - if (!got_buf) { - bp = nfs_getcacheblk(vp, bn, biosize, p); - if (!bp) - return (EINTR); - got_buf = 1; - } - bp->b_flags |= B_INVAFTERWRITE; - if (bp->b_dirtyend > 0) { - if ((bp->b_flags & B_DELWRI) == 0) - panic("nfsbioread"); - if (VOP_BWRITE(bp) == EINTR) - return (EINTR); - } else - brelse(bp); - goto again; + if (bytelen == 0) + break; + win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, + &bytelen, UBC_READ); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + if (error) { + break; } } - diff = (on >= bp->b_validend) ? 
0 : (bp->b_validend - on); - if (diff < n) - n = diff; + n = 0; break; + case VLNK: nfsstats.biocache_readlinks++; bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p); @@ -247,7 +185,7 @@ again: return (error); } } - n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); + n = MIN(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); got_buf = 1; on = 0; break; @@ -289,18 +227,17 @@ nfs_write(v) int a_ioflag; struct ucred *a_cred; } */ *ap = v; - register int biosize; - register struct uio *uio = ap->a_uio; + int biosize; + struct uio *uio = ap->a_uio; struct proc *p = uio->uio_procp; - register struct vnode *vp = ap->a_vp; + struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); - register struct ucred *cred = ap->a_cred; + struct ucred *cred = ap->a_cred; int ioflag = ap->a_ioflag; - struct buf *bp; struct vattr vattr; struct nfsmount *nmp = VFSTONFS(vp->v_mount); - daddr_t lbn, bn; - int n, on, error = 0; + int error = 0; + int rv; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) @@ -360,85 +297,47 @@ nfs_write(v) */ biosize = nmp->nm_rsize; do { - - /* - * XXX make sure we aren't cached in the VM page cache - */ - uvm_vnp_uncache(vp); + void *win; + voff_t oldoff = uio->uio_offset; + vsize_t bytelen = uio->uio_resid; nfsstats.biocache_writes++; - lbn = uio->uio_offset / biosize; - on = uio->uio_offset & (biosize-1); - n = min((unsigned)(biosize - on), uio->uio_resid); - bn = lbn * (biosize / DEV_BSIZE); -again: - bp = nfs_getcacheblk(vp, bn, biosize, p); - if (!bp) - return (EINTR); np->n_flag |= NMODIFIED; - if (uio->uio_offset + n > np->n_size) { - np->n_size = uio->uio_offset + n; - uvm_vnp_setsize(vp, (u_long)np->n_size); - } - - /* - * If the new write will leave a contiguous dirty - * area, just update the b_dirtyoff and b_dirtyend, - * otherwise force a write rpc of the old dirty area. 
- */ - if (bp->b_dirtyend > 0 && - (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { - bp->b_proc = p; - if (VOP_BWRITE(bp) == EINTR) - return (EINTR); - goto again; - } - - error = uiomove((char *)bp->b_data + on, n, uio); - if (error) { - bp->b_flags |= B_ERROR; - brelse(bp); - return (error); + if (np->n_size < uio->uio_offset + bytelen) { + np->n_size = uio->uio_offset + bytelen; + uvm_vnp_setsize(vp, np->n_size); } - if (bp->b_dirtyend > 0) { - bp->b_dirtyoff = min(on, bp->b_dirtyoff); - bp->b_dirtyend = max((on + n), bp->b_dirtyend); - } else { - bp->b_dirtyoff = on; - bp->b_dirtyend = on + n; + win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, &bytelen, + UBC_WRITE); + error = uiomove(win, bytelen, uio); + ubc_release(win, 0); + rv = 1; + if ((ioflag & IO_SYNC)) { + simple_lock(&vp->v_uvm.u_obj.vmobjlock); + rv = vp->v_uvm.u_obj.pgops->pgo_flush( + &vp->v_uvm.u_obj, + oldoff & ~(nmp->nm_wsize - 1), + uio->uio_offset & ~(nmp->nm_wsize - 1), + PGO_CLEANIT|PGO_SYNCIO); + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); + } else if ((oldoff & ~(nmp->nm_wsize - 1)) != + (uio->uio_offset & ~(nmp->nm_wsize - 1))) { + simple_lock(&vp->v_uvm.u_obj.vmobjlock); + rv = vp->v_uvm.u_obj.pgops->pgo_flush( + &vp->v_uvm.u_obj, + oldoff & ~(nmp->nm_wsize - 1), + uio->uio_offset & ~(nmp->nm_wsize - 1), + PGO_CLEANIT|PGO_WEAK); + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); } - if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff || - bp->b_validoff > bp->b_dirtyend) { - bp->b_validoff = bp->b_dirtyoff; - bp->b_validend = bp->b_dirtyend; - } else { - bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff); - bp->b_validend = max(bp->b_validend, bp->b_dirtyend); + if (!rv) { + error = EIO; } - - /* - * Since this block is being modified, it must be written - * again and not just committed. - */ - bp->b_flags &= ~B_NEEDCOMMIT; - - /* - * If the lease is non-cachable or IO_SYNC do bwrite(). 
- */ - if (ioflag & IO_SYNC) { - bp->b_proc = p; - error = VOP_BWRITE(bp); - if (error) - return (error); - } else if ((n + on) == biosize) { - bp->b_proc = (struct proc *)0; - bp->b_flags |= B_ASYNC; - (void)nfs_writebp(bp, 0); - } else { - bdwrite(bp); + if (error) { + break; } - } while (uio->uio_resid > 0 && n > 0); - return (0); + } while (uio->uio_resid > 0); + return (error); } /* @@ -460,9 +359,9 @@ nfs_getcacheblk(vp, bn, size, p) if (nmp->nm_flag & NFSMNT_INT) { bp = getblk(vp, bn, size, PCATCH, 0); - while (bp == (struct buf *)0) { - if (nfs_sigintr(nmp, (struct nfsreq *)0, p)) - return ((struct buf *)0); + while (bp == NULL) { + if (nfs_sigintr(nmp, NULL, p)) + return (NULL); bp = getblk(vp, bn, size, 0, 2 * hz); } } else @@ -502,7 +401,7 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) np->n_flag |= NFLUSHWANT; error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval", slptimeo); - if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) + if (error && intrflg && nfs_sigintr(nmp, NULL, p)) return (EINTR); } @@ -512,7 +411,7 @@ nfs_vinvalbuf(vp, flags, cred, p, intrflg) np->n_flag |= NFLUSHINPROG; error = vinvalbuf(vp, flags, cred, p, slpflag, 0); while (error) { - if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) { + if (intrflg && nfs_sigintr(nmp, NULL, p)) { np->n_flag &= ~NFLUSHINPROG; if (np->n_flag & NFLUSHWANT) { np->n_flag &= ~NFLUSHWANT; @@ -539,41 +438,20 @@ int nfs_asyncio(bp) struct buf *bp; { - int i,s; + int i; if (nfs_numasync == 0) return (EIO); - for (i = 0; i < NFS_MAXASYNCDAEMON; i++) + for (i = 0; i < NFS_MAXASYNCDAEMON; i++) { if (nfs_iodwant[i]) { - if ((bp->b_flags & B_READ) == 0) { - bp->b_flags |= B_WRITEINPROG; - } - TAILQ_INSERT_TAIL(&nfs_bufq, bp, b_freelist); - nfs_iodwant[i] = (struct proc *)0; + nfs_iodwant[i] = NULL; wakeup((caddr_t)&nfs_iodwant[i]); return (0); } + } - /* - * If it is a read or a write already marked B_WRITEINPROG or B_NOCACHE - * return EIO so the process will call nfs_doio() and do it - * synchronously. - */ - if (bp->b_flags & (B_READ | B_WRITEINPROG | B_NOCACHE)) - return (EIO); - - /* - * Just turn the async write into a delayed write, instead of - * doing in synchronously. Hopefully, at least one of the nfsiods - * is currently doing a write for this file and will pick up the - * delayed writes before going back to sleep. 
- */ - s = splbio(); - buf_dirty(bp); - splx(s); - biodone(bp); - return (0); + return (EIO); } /* @@ -589,7 +467,7 @@ nfs_doio(bp, p) register struct vnode *vp; struct nfsnode *np; struct nfsmount *nmp; - int s, error = 0, diff, len, iomode, must_commit = 0; + int error = 0, diff, len, iomode, must_commit = 0; struct uio uio; struct iovec io; @@ -636,9 +514,7 @@ nfs_doio(bp, p) uiop->uio_offset = ((off_t)bp->b_blkno) << DEV_BSHIFT; nfsstats.read_bios++; error = nfs_readrpc(vp, uiop); - if (!error) { - bp->b_validoff = 0; - if (uiop->uio_resid) { + if (!error && uiop->uio_resid) { /* * If len > 0, there is a hole in the file and * no writes after the hole have been pushed to @@ -649,13 +525,9 @@ nfs_doio(bp, p) len = np->n_size - ((((off_t)bp->b_blkno) << DEV_BSHIFT) + diff); if (len > 0) { - len = min(len, uiop->uio_resid); - bzero((char *)bp->b_data + diff, len); - bp->b_validend = diff + len; - } else - bp->b_validend = diff; - } else - bp->b_validend = bp->b_bcount; + len = MIN(len, uiop->uio_resid); + memset((char *)bp->b_data + diff, 0, len); + } } if (p && (vp->v_flag & VTEXT) && (np->n_mtime != np->n_vattr.va_mtime.tv_sec)) { @@ -672,62 +544,19 @@ nfs_doio(bp, p) default: printf("nfs_doio: type %x unexpected\n",vp->v_type); break; - }; + } if (error) { bp->b_flags |= B_ERROR; bp->b_error = error; } } else { - io.iov_len = uiop->uio_resid = bp->b_dirtyend - - bp->b_dirtyoff; - uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE - + bp->b_dirtyoff; - io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; + io.iov_base = bp->b_data; + io.iov_len = uiop->uio_resid = bp->b_bcount; + uiop->uio_offset = ((off_t)bp->b_blkno) << DEV_BSHIFT; uiop->uio_rw = UIO_WRITE; nfsstats.write_bios++; - if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE)) == B_ASYNC) - iomode = NFSV3WRITE_UNSTABLE; - else - iomode = NFSV3WRITE_FILESYNC; - bp->b_flags |= B_WRITEINPROG; -#ifdef fvdl_debug - printf("nfs_doio(%x): bp %x doff %d dend %d\n", - vp, bp, bp->b_dirtyoff, bp->b_dirtyend); -#endif + iomode = NFSV3WRITE_UNSTABLE; error = nfs_writerpc(vp, uiop, &iomode, &must_commit); - if (!error && iomode == NFSV3WRITE_UNSTABLE) - bp->b_flags |= B_NEEDCOMMIT; - else - bp->b_flags &= ~B_NEEDCOMMIT; - bp->b_flags &= ~B_WRITEINPROG; - - /* - * For an interrupted write, the buffer is still valid and the - * write hasn't been pushed to the server yet, so we can't set - * B_ERROR and report the interruption by setting B_EINTR. For - * the B_ASYNC case, B_EINTR is not relevant, so the rpc attempt - * is essentially a noop. - * For the case of a V3 write rpc not being committed to stable - * storage, the block is still dirty and requires either a commit - * rpc or another write rpc with iomode == NFSV3WRITE_FILESYNC - * before the block is reused. This is indicated by setting the - * B_DELWRI and B_NEEDCOMMIT flags. - */ - if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) { - s = splbio(); - buf_dirty(bp); - splx(s); - - if (!(bp->b_flags & B_ASYNC) && error) - bp->b_flags |= B_EINTR; - } else { - if (error) { - bp->b_flags |= B_ERROR; - bp->b_error = np->n_error = error; - np->n_flag |= NWRITEERR; - } - bp->b_dirtyoff = bp->b_dirtyend = 0; - } } bp->b_resid = uiop->uio_resid; if (must_commit) @@ -735,3 +564,590 @@ nfs_doio(bp, p) biodone(bp); return (error); } + +/* + * Vnode op for VM getpages. 
+ */ +int +nfs_getpages(v) + void *v; +{ + struct vop_getpages_args /* { + struct vnode *a_vp; + voff_t a_offset; + vm_page_t *a_m; + int *a_count; + int a_centeridx; + vm_prot_t a_access_type; + int a_advice; + int a_flags; + } */ *ap = v; + + off_t eof, offset, origoffset, startoffset, endoffset; + int s, i, error, npages, orignpages, npgs, ridx, pidx, pcount; + vaddr_t kva; + struct buf *bp, *mbp; + struct vnode *vp = ap->a_vp; + struct nfsnode *np = VTONFS(vp); + struct uvm_object *uobj = &vp->v_uvm.u_obj; + struct nfsmount *nmp = VFSTONFS(vp->v_mount); + size_t bytes, iobytes, tailbytes, totalbytes, skipbytes; + int flags = ap->a_flags; + int bsize; + struct vm_page *pgs[16]; /* XXXUBC 16 */ + boolean_t v3 = NFS_ISV3(vp); + boolean_t async = (flags & PGO_SYNCIO) == 0; + boolean_t write = (ap->a_access_type & VM_PROT_WRITE) != 0; + struct proc *p = curproc; + + UVMHIST_FUNC("nfs_getpages"); UVMHIST_CALLED(ubchist); + UVMHIST_LOG(ubchist, "vp %p off 0x%x count %d", vp, (int)ap->a_offset, + *ap->a_count,0); + +#ifdef DIAGNOSTIC + if (ap->a_centeridx < 0 || ap->a_centeridx >= *ap->a_count) { + panic("nfs_getpages: centeridx %d out of range", + ap->a_centeridx); + } +#endif + + error = 0; + origoffset = ap->a_offset; + eof = vp->v_uvm.u_size; + if (origoffset >= eof) { + if ((flags & PGO_LOCKED) == 0) { + simple_unlock(&uobj->vmobjlock); + } + UVMHIST_LOG(ubchist, "off 0x%x past EOF 0x%x", + (int)origoffset, (int)eof,0,0); + return EINVAL; + } + + if (flags & PGO_LOCKED) { + uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m, + UFP_NOWAIT|UFP_NOALLOC); + return 0; + } + + /* vnode is VOP_LOCKed, uobj is locked */ + + bsize = nmp->nm_rsize; + orignpages = MIN(*ap->a_count, + round_page(eof - origoffset) >> PAGE_SHIFT); + npages = orignpages; + startoffset = origoffset & ~(bsize - 1); + endoffset = round_page((origoffset + (npages << PAGE_SHIFT) + + bsize - 1) & ~(bsize - 1)); + endoffset = MIN(endoffset, round_page(eof)); + ridx = (origoffset - startoffset) >> PAGE_SHIFT; + + if (!async && !write) { + int rapages = MAX(PAGE_SIZE, nmp->nm_rsize) >> PAGE_SHIFT; + + (void) VOP_GETPAGES(vp, endoffset, NULL, &rapages, 0, + VM_PROT_READ, 0, 0); + simple_lock(&uobj->vmobjlock); + } + + UVMHIST_LOG(ubchist, "npages %d offset 0x%x", npages, + (int)origoffset, 0,0); + memset(pgs, 0, sizeof(pgs)); + uvn_findpages(uobj, origoffset, &npages, &pgs[ridx], UFP_ALL); + + if (flags & PGO_OVERWRITE) { + UVMHIST_LOG(ubchist, "PGO_OVERWRITE",0,0,0,0); + + /* XXXUBC for now, zero the page if we allocated it */ + for (i = 0; i < npages; i++) { + struct vm_page *pg = pgs[ridx + i]; + + if (pg->flags & PG_FAKE) { + uvm_pagezero(pg); + pg->flags &= ~(PG_FAKE); + } + } + npages += ridx; + if (v3) { + simple_unlock(&uobj->vmobjlock); + goto uncommit; + } + goto out; + } + + /* + * if the pages are already resident, just return them. + */ + + for (i = 0; i < npages; i++) { + struct vm_page *pg = pgs[ridx + i]; + + if ((pg->flags & PG_FAKE) != 0 || + ((ap->a_access_type & VM_PROT_WRITE) && + (pg->flags & PG_RDONLY))) { + break; + } + } + if (i == npages) { + UVMHIST_LOG(ubchist, "returning cached pages", 0,0,0,0); + npages += ridx; + goto out; + } + + /* + * the page wasn't resident and we're not overwriting, + * so we're going to have to do some i/o. + * find any additional pages needed to cover the expanded range. 
+ */ + + if (startoffset != origoffset || + startoffset + (npages << PAGE_SHIFT) != endoffset) { + + /* + * XXXUBC we need to avoid deadlocks caused by locking + * additional pages at lower offsets than pages we + * already have locked. for now, unlock them all and + * start over. + */ + + for (i = 0; i < npages; i++) { + struct vm_page *pg = pgs[ridx + i]; + + if (pg->flags & PG_FAKE) { + pg->flags |= PG_RELEASED; + } + } + uvm_page_unbusy(&pgs[ridx], npages); + memset(pgs, 0, sizeof(pgs)); + + UVMHIST_LOG(ubchist, "reset npages start 0x%x end 0x%x", + startoffset, endoffset, 0,0); + npages = (endoffset - startoffset) >> PAGE_SHIFT; + npgs = npages; + uvn_findpages(uobj, startoffset, &npgs, pgs, UFP_ALL); + } + simple_unlock(&uobj->vmobjlock); + + /* + * update the cached read creds for this node. + */ + + if (np->n_rcred) { + crfree(np->n_rcred); + } + np->n_rcred = curproc->p_ucred; + crhold(np->n_rcred); + + /* + * read the desired page(s). + */ + + totalbytes = npages << PAGE_SHIFT; + bytes = MIN(totalbytes, vp->v_uvm.u_size - startoffset); + tailbytes = totalbytes - bytes; + skipbytes = 0; + + kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WAITOK | + UVMPAGER_MAPIN_READ); + + s = splbio(); + mbp = pool_get(&bufpool, PR_WAITOK); + splx(s); + mbp->b_bufsize = totalbytes; + mbp->b_data = (void *)kva; + mbp->b_resid = mbp->b_bcount = bytes; + mbp->b_flags = B_BUSY|B_READ| (async ? B_CALL|B_ASYNC : 0); + mbp->b_iodone = uvm_aio_biodone; + mbp->b_vp = vp; + mbp->b_proc = NULL; /* XXXUBC */ + LIST_INIT(&mbp->b_dep); + + /* + * if EOF is in the middle of the last page, zero the part past EOF. + */ + + if (tailbytes > 0 && (pgs[bytes >> PAGE_SHIFT]->flags & PG_FAKE)) { + memset((char *)kva + bytes, 0, tailbytes); + } + + /* + * now loop over the pages, reading as needed. + */ + + bp = NULL; + for (offset = startoffset; + bytes > 0; + offset += iobytes, bytes -= iobytes) { + + /* + * skip pages which don't need to be read. + */ + + pidx = (offset - startoffset) >> PAGE_SHIFT; + UVMHIST_LOG(ubchist, "pidx %d offset 0x%x startoffset 0x%x", + pidx, (int)offset, (int)startoffset,0); + while ((pgs[pidx]->flags & PG_FAKE) == 0) { + size_t b; + + KASSERT((offset & (PAGE_SIZE - 1)) == 0); + b = MIN(PAGE_SIZE, bytes); + offset += b; + bytes -= b; + skipbytes += b; + pidx++; + UVMHIST_LOG(ubchist, "skipping, new offset 0x%x", + (int)offset, 0,0,0); + if (bytes == 0) { + goto loopdone; + } + } + + /* + * see how many pages can be read with this i/o. + * reduce the i/o size if necessary. + */ + + iobytes = bytes; + if (offset + iobytes > round_page(offset)) { + pcount = 1; + while (pidx + pcount < npages && + pgs[pidx + pcount]->flags & PG_FAKE) { + pcount++; + } + iobytes = MIN(iobytes, (pcount << PAGE_SHIFT) - + (offset - trunc_page(offset))); + } + iobytes = MIN(iobytes, nmp->nm_rsize); + + /* + * allocate a sub-buf for this piece of the i/o + * (or just use mbp if there's only 1 piece), + * and start it going. 
+ */ + + if (offset == startoffset && iobytes == bytes) { + bp = mbp; + } else { + s = splbio(); + bp = pool_get(&bufpool, PR_WAITOK); + splx(s); + bp->b_data = (char *)kva + offset - startoffset; + bp->b_resid = bp->b_bcount = iobytes; + bp->b_flags = B_BUSY|B_READ|B_CALL|B_ASYNC; + bp->b_iodone = uvm_aio_biodone1; + bp->b_vp = vp; + bp->b_proc = NULL; /* XXXUBC */ + LIST_INIT(&bp->b_dep); + } + bp->b_private = mbp; + bp->b_lblkno = bp->b_blkno = offset >> DEV_BSHIFT; + + UVMHIST_LOG(ubchist, "bp %p offset 0x%x bcount 0x%x blkno 0x%x", + bp, offset, iobytes, bp->b_blkno); + + VOP_STRATEGY(bp); + } + +loopdone: + if (skipbytes) { + s = splbio(); + mbp->b_resid -= skipbytes; + if (mbp->b_resid == 0) { + biodone(mbp); + } + splx(s); + } + if (async) { + UVMHIST_LOG(ubchist, "returning PEND",0,0,0,0); + return EINPROGRESS; + } + if (bp != NULL) { + error = biowait(mbp); + } + s = splbio(); + pool_put(&bufpool, mbp); + splx(s); + uvm_pagermapout(kva, npages); + + if (write && v3) { +uncommit: + lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL, p); + nfs_del_committed_range(vp, origoffset, npages); + nfs_del_tobecommitted_range(vp, origoffset, npages); + simple_lock(&uobj->vmobjlock); + for (i = 0; i < npages; i++) { + if (pgs[i] == NULL) { + continue; + } + pgs[i]->flags &= ~(PG_NEEDCOMMIT|PG_RDONLY); + } + simple_unlock(&uobj->vmobjlock); + lockmgr(&np->n_commitlock, LK_RELEASE, NULL, p); + } + + simple_lock(&uobj->vmobjlock); + +out: + if (error) { + uvm_lock_pageq(); + for (i = 0; i < npages; i++) { + if (pgs[i] == NULL) { + continue; + } + UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x", + pgs[i], pgs[i]->flags, 0,0); + if (pgs[i]->flags & PG_WANTED) { + wakeup(pgs[i]); + } + if (pgs[i]->flags & PG_RELEASED) { + uvm_unlock_pageq(); + (uobj->pgops->pgo_releasepg)(pgs[i], NULL); + uvm_lock_pageq(); + continue; + } + if (pgs[i]->flags & PG_FAKE) { + uvm_pagefree(pgs[i]); + continue; + } + uvm_pageactivate(pgs[i]); + pgs[i]->flags &= ~(PG_WANTED|PG_BUSY); + UVM_PAGE_OWN(pgs[i], NULL); + } + uvm_unlock_pageq(); + simple_unlock(&uobj->vmobjlock); + UVMHIST_LOG(ubchist, "returning error %d", error,0,0,0); + return error; + } + + UVMHIST_LOG(ubchist, "ridx %d count %d", ridx, npages, 0,0); + uvm_lock_pageq(); + for (i = 0; i < npages; i++) { + if (pgs[i] == NULL) { + continue; + } + UVMHIST_LOG(ubchist, "examining pg %p flags 0x%x", + pgs[i], pgs[i]->flags, 0,0); + if (pgs[i]->flags & PG_FAKE) { + UVMHIST_LOG(ubchist, "unfaking pg %p offset 0x%x", + pgs[i], (int)pgs[i]->offset,0,0); + pgs[i]->flags &= ~(PG_FAKE); + pmap_clear_modify(pgs[i]); + pmap_clear_reference(pgs[i]); + } + if (i < ridx || i >= ridx + orignpages || async) { + UVMHIST_LOG(ubchist, "unbusy pg %p offset 0x%x", + pgs[i], (int)pgs[i]->offset,0,0); + if (pgs[i]->flags & PG_WANTED) { + wakeup(pgs[i]); + } + if (pgs[i]->flags & PG_RELEASED) { + uvm_unlock_pageq(); + (uobj->pgops->pgo_releasepg)(pgs[i], NULL); + uvm_lock_pageq(); + continue; + } + uvm_pageactivate(pgs[i]); + pgs[i]->flags &= ~(PG_WANTED|PG_BUSY); + UVM_PAGE_OWN(pgs[i], NULL); + } + } + uvm_unlock_pageq(); + simple_unlock(&uobj->vmobjlock); + if (ap->a_m != NULL) { + memcpy(ap->a_m, &pgs[ridx], + *ap->a_count * sizeof(struct vm_page *)); + } + return 0; +} + +/* + * Vnode op for VM putpages. 
+ */ +int +nfs_putpages(v) + void *v; +{ + struct vop_putpages_args /* { + struct vnode *a_vp; + struct vm_page **a_m; + int a_count; + int a_flags; + int *a_rtvals; + } */ *ap = v; + + struct vnode *vp = ap->a_vp; + struct nfsnode *np = VTONFS(vp); + struct nfsmount *nmp = VFSTONFS(vp->v_mount); + struct buf *bp, *mbp; + struct vm_page **pgs = ap->a_m; + int flags = ap->a_flags; + int npages = ap->a_count; + int s, error, i; + size_t bytes, iobytes, skipbytes; + vaddr_t kva; + off_t offset, origoffset, commitoff; + uint32_t commitbytes; + boolean_t v3 = NFS_ISV3(vp); + boolean_t async = (flags & PGO_SYNCIO) == 0; + boolean_t weak = (flags & PGO_WEAK) && v3; + struct proc *p = curproc; + UVMHIST_FUNC("nfs_putpages"); UVMHIST_CALLED(ubchist); + + UVMHIST_LOG(ubchist, "vp %p pgp %p count %d", + vp, ap->a_m, ap->a_count,0); + + simple_unlock(&vp->v_uvm.u_obj.vmobjlock); + + error = 0; + origoffset = pgs[0]->offset; + bytes = MIN(ap->a_count << PAGE_SHIFT, vp->v_uvm.u_size - origoffset); + skipbytes = 0; + + /* + * if the range has been committed already, mark the pages thus. + * if the range just needs to be committed, we're done + * if it's a weak putpage, otherwise commit the range. + */ + + if (v3) { + lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL, p); + if (nfs_in_committed_range(vp, origoffset, bytes)) { + goto committed; + } + if (nfs_in_tobecommitted_range(vp, origoffset, bytes)) { + if (weak) { + lockmgr(&np->n_commitlock, LK_RELEASE, NULL, p); + return 0; + } else { + commitoff = np->n_pushlo; + commitbytes = (uint32_t)(np->n_pushhi - + np->n_pushlo); + goto commit; + } + } + lockmgr(&np->n_commitlock, LK_RELEASE, NULL, p); + } + + /* + * otherwise write or commit all the pages. + */ + + kva = uvm_pagermapin(pgs, ap->a_count, UVMPAGER_MAPIN_WAITOK| + UVMPAGER_MAPIN_WRITE); + + s = splbio(); + vp->v_numoutput += 2; + mbp = pool_get(&bufpool, PR_WAITOK); + UVMHIST_LOG(ubchist, "vp %p mbp %p num now %d bytes 0x%x", + vp, mbp, vp->v_numoutput, bytes); + splx(s); + mbp->b_bufsize = npages << PAGE_SHIFT; + mbp->b_data = (void *)kva; + mbp->b_resid = mbp->b_bcount = bytes; + mbp->b_flags = B_BUSY|B_WRITE|B_AGE | + (async ? B_CALL|B_ASYNC : 0) | + (curproc == uvm.pagedaemon_proc ? B_PDAEMON : 0); + mbp->b_iodone = uvm_aio_biodone; + mbp->b_vp = vp; + mbp->b_proc = NULL; /* XXXUBC */ + LIST_INIT(&mbp->b_dep); + + for (offset = origoffset; + bytes > 0; + offset += iobytes, bytes -= iobytes) { + iobytes = MIN(nmp->nm_wsize, bytes); + + /* + * skip writing any pages which only need a commit. 
+ */ + + if ((pgs[(offset - origoffset) >> PAGE_SHIFT]->flags & + PG_NEEDCOMMIT) != 0) { + KASSERT((offset & (PAGE_SIZE - 1)) == 0); + iobytes = MIN(PAGE_SIZE, bytes); + skipbytes += iobytes; + continue; + } + + /* if it's really one i/o, don't make a second buf */ + if (offset == origoffset && iobytes == bytes) { + bp = mbp; + } else { + s = splbio(); + vp->v_numoutput++; + bp = pool_get(&bufpool, PR_WAITOK); + UVMHIST_LOG(ubchist, "vp %p bp %p num now %d", + vp, bp, vp->v_numoutput, 0); + splx(s); + bp->b_data = (char *)kva + (offset - origoffset); + bp->b_resid = bp->b_bcount = iobytes; + bp->b_flags = B_BUSY|B_WRITE|B_CALL|B_ASYNC; + bp->b_iodone = uvm_aio_biodone1; + bp->b_vp = vp; + bp->b_proc = NULL; /* XXXUBC */ + LIST_INIT(&bp->b_dep); + } + bp->b_private = mbp; + bp->b_lblkno = bp->b_blkno = (daddr_t)(offset >> DEV_BSHIFT); + UVMHIST_LOG(ubchist, "bp %p numout %d", + bp, vp->v_numoutput,0,0); + VOP_STRATEGY(bp); + } + if (skipbytes) { + UVMHIST_LOG(ubchist, "skipbytes %d", bytes, 0,0,0); + s = splbio(); + mbp->b_resid -= skipbytes; + if (mbp->b_resid == 0) { + biodone(mbp); + } + splx(s); + } + if (async) { + return EINPROGRESS; + } + if (bp != NULL) { + error = biowait(mbp); + } + + s = splbio(); + if (mbp->b_vp) + vwakeup(mbp->b_vp); + pool_put(&bufpool, mbp); + splx(s); + + uvm_pagermapout(kva, ap->a_count); + if (error || !v3) { + UVMHIST_LOG(ubchist, "returning error %d", error, 0,0,0); + return error; + } + + /* + * for a weak put, mark the range as "to be committed" + * and mark the pages read-only so that we will be notified + * to remove the pages from the "to be committed" range + * if they are made dirty again. + * for a strong put, commit the pages and remove them from the + * "to be committed" range. also, mark them as writable + * and not cleanable with just a commit. + */ + + lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL, p); + if (weak) { + nfs_add_tobecommitted_range(vp, origoffset, + npages << PAGE_SHIFT); + for (i = 0; i < npages; i++) { + pgs[i]->flags |= PG_NEEDCOMMIT|PG_RDONLY; + } + } else { + commitoff = origoffset; + commitbytes = npages << PAGE_SHIFT; +commit: + error = nfs_commit(vp, commitoff, commitbytes, curproc); + nfs_del_tobecommitted_range(vp, commitoff, commitbytes); +committed: + for (i = 0; i < npages; i++) { + pgs[i]->flags &= ~(PG_NEEDCOMMIT|PG_RDONLY); + } + } + lockmgr(&np->n_commitlock, LK_RELEASE, NULL, p); + return error; +} |
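
On the write side, nfs_write() now follows the same shape: the data is copied into the page cache through a UBC window instead of into a buffer-cache block, and IO_SYNC writes are pushed to the server through the vnode object's pgo_flush() pager operation rather than VOP_BWRITE(). A condensed sketch of that loop, again using only names from the diff; the PGO_WEAK flush that the diff issues when a write crosses an nm_wsize boundary is left out here.

	/*
	 * Sketch of the UBC write path in nfs_write(): grow the file if
	 * needed, copy the user data in through a UBC window, and for
	 * IO_SYNC flush the dirty pages synchronously.  Simplified from
	 * the diff.
	 */
	do {
		void *win;
		voff_t oldoff = uio->uio_offset;
		vsize_t bytelen = uio->uio_resid;

		np->n_flag |= NMODIFIED;
		if (np->n_size < uio->uio_offset + bytelen) {
			/* extend the file before dirtying the new pages */
			np->n_size = uio->uio_offset + bytelen;
			uvm_vnp_setsize(vp, np->n_size);
		}

		win = ubc_alloc(&vp->v_uvm.u_obj, uio->uio_offset, &bytelen,
		    UBC_WRITE);
		error = uiomove(win, bytelen, uio);
		ubc_release(win, 0);

		if (error == 0 && (ioflag & IO_SYNC)) {
			/* push the dirty range to the server synchronously */
			simple_lock(&vp->v_uvm.u_obj.vmobjlock);
			if (!vp->v_uvm.u_obj.pgops->pgo_flush(&vp->v_uvm.u_obj,
			    oldoff & ~(nmp->nm_wsize - 1),
			    uio->uio_offset & ~(nmp->nm_wsize - 1),
			    PGO_CLEANIT | PGO_SYNCIO))
				error = EIO;
			simple_unlock(&vp->v_uvm.u_obj.vmobjlock);
		}
	} while (error == 0 && uio->uio_resid > 0);
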
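The other substantial addition is the pair of pager entry points, nfs_getpages() and nfs_putpages(). nfs_putpages() is where the NFSv3 commit protocol meets the page cache: writes go out UNSTABLE (see the iomode change in nfs_doio()), a "weak" put (PGO_WEAK) only records the range as to-be-committed and marks the pages PG_NEEDCOMMIT|PG_RDONLY, and a strong put sends the actual COMMIT RPC. The decision at the top of nfs_putpages() reduces to roughly the following, excerpted and simplified from the diff; the commit and committed labels live further down in the function.

	/*
	 * Condensed view of the NFSv3 commit bookkeeping at the start of
	 * nfs_putpages().  If the range was already committed there is
	 * nothing to write; if it is only queued for commit, a weak put
	 * can return early, while a strong put commits the whole queued
	 * range in one RPC.
	 */
	if (v3) {
		lockmgr(&np->n_commitlock, LK_EXCLUSIVE, NULL, p);
		if (nfs_in_committed_range(vp, origoffset, bytes))
			goto committed;
		if (nfs_in_tobecommitted_range(vp, origoffset, bytes)) {
			if (weak) {
				lockmgr(&np->n_commitlock, LK_RELEASE, NULL, p);
				return 0;
			}
			commitoff = np->n_pushlo;
			commitbytes = (uint32_t)(np->n_pushhi - np->n_pushlo);
			goto commit;
		}
		lockmgr(&np->n_commitlock, LK_RELEASE, NULL, p);
	}
	/*
	 * ... otherwise the pages are written UNSTABLE, then committed or
	 * queued as to-be-committed depending on weak vs. strong ...
	 */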