-rw-r--r-- | sys/kern/vfs_bio.c       |  57
-rw-r--r-- | sys/kern/vfs_cluster.c   |  12
-rw-r--r-- | sys/kern/vfs_subr.c      | 290
-rw-r--r-- | sys/nfsclient/nfs_bio.c  |   4
-rw-r--r-- | sys/nfsserver/nfs_serv.c |   8
-rw-r--r-- | sys/sys/buf.h            |   9
-rw-r--r-- | sys/sys/vnode.h          |   6

7 files changed, 284 insertions(+), 102 deletions(-)
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 1f86b68..a3639b4 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -189,6 +189,7 @@ static int runningbufreq;
  */
 static int needsbuffer;
 
+#ifdef USE_BUFHASH
 /*
  * Mask for index into the buffer hash table, which needs to be power of 2 in
  * size.  Set in kern_vfs_bio_buffer_alloc.
@@ -208,6 +209,8 @@ static LIST_HEAD(bufhashhdr, buf) *bufhashtbl;
  */
 static struct bufhashhdr invalhash;
+#endif
+
 /*
  * Definitions for the buffer free lists.
  */
@@ -233,6 +236,7 @@ const char *buf_wmesg = BUF_WMESG;
 #define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
 #define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */
 
+#ifdef USE_BUFHASH
 /*
  * Buffer hash table code.  Note that the logical block scans linearly, which
  * gives us some L1 cache locality.
@@ -245,6 +249,8 @@ bufhash(struct vnode *vnp, daddr_t bn)
 	return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]);
 }
+#endif
+
 /*
  * numdirtywakeup:
  *
@@ -463,6 +469,7 @@ kern_vfs_bio_buffer_alloc(caddr_t v, int physmem_est)
 	buf = (void *)v;
 	v = (caddr_t)(buf + nbuf);
 
+#ifdef USE_BUFHASH
 	/*
 	 * Calculate the hash table size and reserve space
 	 */
@@ -471,7 +478,7 @@ kern_vfs_bio_buffer_alloc(caddr_t v, int physmem_est)
 	bufhashtbl = (void *)v;
 	v = (caddr_t)(bufhashtbl + bufhashmask);
 	--bufhashmask;
-
+#endif
 	return(v);
 }
@@ -484,11 +491,15 @@ bufinit(void)
 
 	GIANT_REQUIRED;
 
+#ifdef USE_BUFHASH
 	LIST_INIT(&invalhash);
+#endif
 	mtx_init(&buftimelock, "buftime lock", NULL, MTX_DEF);
 
+#ifdef USE_BUFHASH
 	for (i = 0; i <= bufhashmask; i++)
 		LIST_INIT(&bufhashtbl[i]);
+#endif
 
 	/* next, make a null set of free lists */
 	for (i = 0; i < BUFFER_QUEUES; i++)
@@ -507,7 +518,9 @@ bufinit(void)
 		LIST_INIT(&bp->b_dep);
 		BUF_LOCKINIT(bp);
 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
+#ifdef USE_BUFHASH
 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
+#endif
 	}
 
 	/*
@@ -787,10 +800,15 @@ bwrite(struct buf * bp)
 		/* get a new block */
 		newbp = geteblk(bp->b_bufsize);
 
-		/* set it to be identical to the old block */
+		/*
+		 * set it to be identical to the old block.  We have to
+		 * set b_lblkno and BKGRDMARKER before calling bgetvp()
+		 * to avoid confusing the splay tree and gbincore().
+		 */
 		memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
-		bgetvp(bp->b_vp, newbp);
 		newbp->b_lblkno = bp->b_lblkno;
+		newbp->b_xflags |= BX_BKGRDMARKER;
+		bgetvp(bp->b_vp, newbp);
 		newbp->b_blkno = bp->b_blkno;
 		newbp->b_offset = bp->b_offset;
 		newbp->b_iodone = vfs_backgroundwritedone;
@@ -1302,8 +1320,10 @@ brelse(struct buf * bp)
 			bp->b_qindex = QUEUE_EMPTY;
 		}
 		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
+#ifdef USE_BUFHASH
 		LIST_REMOVE(bp, b_hash);
 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
+#endif
 		bp->b_dev = NODEV;
 	/* buffers with junk contents */
 	} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
@@ -1314,8 +1334,10 @@ brelse(struct buf * bp)
 			panic("losing buffer 2");
 		bp->b_qindex = QUEUE_CLEAN;
 		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
+#ifdef USE_BUFHASH
 		LIST_REMOVE(bp, b_hash);
 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
+#endif
 		bp->b_dev = NODEV;
 
 	/* buffers that are locked */
@@ -1336,11 +1358,17 @@ brelse(struct buf * bp)
 	}
 
 	/*
-	 * If B_INVAL, clear B_DELWRI.  We've already placed the buffer
-	 * on the correct queue.
+	 * If B_INVAL and B_DELWRI is set, clear B_DELWRI.  We have already
+	 * placed the buffer on the correct queue.  We must also disassociate
+	 * the device and vnode for a B_INVAL buffer so gbincore() doesn't
+	 * find it.
 	 */
-	if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI))
-		bundirty(bp);
+	if (bp->b_flags & B_INVAL) {
+		if (bp->b_flags & B_DELWRI)
+			bundirty(bp);
+		if (bp->b_vp)
+			brelvp(bp);
+	}
 
 	/*
 	 * Fixup numfreebuffers count.  The bp is on an appropriate queue
@@ -1493,7 +1521,10 @@ vfs_vmio_release(bp)
 		brelvp(bp);
 }
 
+#ifdef USE_BUFHASH
 /*
+ * XXX MOVED TO VFS_SUBR.C
+ *
  * Check to see if a block is currently memory resident.
  */
 struct buf *
@@ -1514,6 +1545,7 @@ gbincore(struct vnode * vp, daddr_t blkno)
 	}
 	return (bp);
 }
+#endif
 
 /*
  * vfs_bio_awrite:
@@ -1782,8 +1814,10 @@ restart:
 			buf_deallocate(bp);
 		if (bp->b_xflags & BX_BKGRDINPROG)
 			panic("losing buffer 3");
+#ifdef USE_BUFHASH
 		LIST_REMOVE(bp, b_hash);
 		LIST_INSERT_HEAD(&invalhash, bp, b_hash);
+#endif
 
 		if (bp->b_bufsize)
 			allocbuf(bp, 0);
@@ -2231,7 +2265,9 @@ getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
 {
 	struct buf *bp;
 	int s;
+#ifdef USE_BUFHASH
 	struct bufhashhdr *bh;
+#endif
 
 	if (size > MAXBSIZE)
 		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
@@ -2392,6 +2428,11 @@ loop:
 		 * race because we are safely running at splbio() from the
 		 * point of the duplicate buffer creation through to here,
 		 * and we've locked the buffer.
+		 *
+		 * Note: this must occur before we associate the buffer
+		 * with the vp especially considering limitations in
+		 * the splay tree implementation when dealing with duplicate
+		 * lblkno's.
 		 */
 		if (gbincore(vp, blkno)) {
 			bp->b_flags |= B_INVAL;
@@ -2407,9 +2448,11 @@ loop:
 		bp->b_offset = offset;
 
 		bgetvp(vp, bp);
+#ifdef USE_BUFHASH
 		LIST_REMOVE(bp, b_hash);
 		bh = bufhash(vp, blkno);
 		LIST_INSERT_HEAD(bh, bp, b_hash);
+#endif
 
 		/*
 		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
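The delicate part of the vfs_bio.c changes is in bwrite(): a background bitmap write clones the buffer, so for a moment the vnode holds two buffers with the same b_lblkno. The clone is stamped BX_BKGRDMARKER before bgetvp() so the splay tree can keep the pair ordered, and gbincore(), which probes with xflags of 0, always lands on the foreground copy. Below is a standalone sketch of that ordering; struct bkey and buf_cmp() are hypothetical names invented for the illustration, not part of the patch.

#include <assert.h>
#include <sys/types.h>		/* daddr_t, as used by the kernel code */

#define BX_BKGRDMARKER	0x20	/* same bit value the patch adds to buf.h */

struct bkey {
	daddr_t		lblkno;	/* logical block number */
	unsigned char	xflags;	/* only the marker bit matters here */
};

/*
 * Ordering used by buf_splay(): primary key is b_lblkno, with the
 * BX_BKGRDMARKER bit breaking the tie, so a background clone sorts
 * just after its foreground twin instead of colliding with it.
 */
static int
buf_cmp(const struct bkey *a, const struct bkey *b)
{
	if (a->lblkno != b->lblkno)
		return (a->lblkno < b->lblkno ? -1 : 1);
	return ((a->xflags & BX_BKGRDMARKER) - (b->xflags & BX_BKGRDMARKER));
}

int
main(void)
{
	struct bkey fg = { 42, 0 };
	struct bkey bg = { 42, BX_BKGRDMARKER };

	assert(buf_cmp(&fg, &bg) < 0);	/* foreground sorts first */
	assert(buf_cmp(&fg, &fg) == 0);	/* exact match only on itself */
	return (0);
}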
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 4c11952..452dfa1 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -152,10 +152,13 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
 		 */
 		s = splbio();
 		for (i = 1; i < maxra; i++) {
-
-			if (!(tbp = incore(vp, lblkno+i))) {
+			/*
+			 * Stop if the buffer does not exist or it
+			 * is invalid (about to go away?)
+			 */
+			tbp = gbincore(vp, lblkno+i);
+			if (tbp == NULL || (tbp->b_flags & B_INVAL))
 				break;
-			}
 
 			/*
 			 * Set another read-ahead mark so we know
@@ -396,7 +399,8 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
 		 * would block in the lock.  The same checks have to
 		 * be made again after we officially get the buffer.
 		 */
-		if ((tbp = incore(vp, lbn + i)) != NULL) {
+		if ((tbp = incore(vp, lbn + i)) != NULL &&
+		    (tbp->b_flags & B_INVAL) == 0) {
 			if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT))
 				break;
 			BUF_UNLOCK(tbp);
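Both vfs_cluster.c hunks enforce the same new rule: a splay-tree lookup can return a buffer that still carries B_INVAL, so "not present" and "present but invalid" must terminate the read-ahead scan identically. A minimal self-contained illustration follows; struct xbuf, readahead_stop() and the flag value are stand-ins for the demo, not kernel definitions.

#include <assert.h>
#include <stddef.h>

#define B_INVAL	0x2000		/* flag value illustrative only */

struct xbuf {
	long	b_flags;
};

/* The scan loop above stops as soon as this returns nonzero. */
static int
readahead_stop(const struct xbuf *tbp)
{
	return (tbp == NULL || (tbp->b_flags & B_INVAL) != 0);
}

int
main(void)
{
	struct xbuf valid = { 0 }, dying = { B_INVAL };

	assert(!readahead_stop(&valid));	/* cached and valid: keep going */
	assert(readahead_stop(&dying));		/* invalidated: stop */
	assert(readahead_stop(NULL));		/* missing: stop */
	return (0);
}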
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 1762a1f..3ebb88e 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -120,15 +120,6 @@ SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
  */
 static int reassignbufcalls;
 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
-static int reassignbufloops;
-SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
-static int reassignbufsortgood;
-SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
-static int reassignbufsortbad;
-SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
-/* Set to 0 for old insertion-sort based reassignbuf, 1 for modern method. */
-static int reassignbufmethod = 1;
-SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
 static int nameileafonly;
 SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");
@@ -812,6 +803,8 @@ getnewvnode(tag, mp, vops, vpp)
 		vp->v_cstart = 0;
 		vp->v_clen = 0;
 		vp->v_socket = 0;
+		KASSERT(vp->v_cleanblkroot == NULL, ("cleanblkroot not NULL"));
+		KASSERT(vp->v_dirtyblkroot == NULL, ("dirtyblkroot not NULL"));
 	} else {
 		mtx_unlock(&vnode_free_list_mtx);
 		vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK);
 	}
@@ -1132,6 +1125,199 @@ restartsync:
 }
 
 /*
+ * buf_splay() - splay tree core for the clean/dirty list of buffers in
+ * a vnode.
+ *
+ * NOTE: We have to deal with the special case of a background bitmap
+ * buffer, a situation where two buffers will have the same logical
+ * block offset.  We want (1) only the foreground buffer to be accessed
+ * in a lookup and (2) must differentiate between the foreground and
+ * background buffer in the splay tree algorithm because the splay
+ * tree cannot normally handle multiple entities with the same 'index'.
+ * We accomplish this by adding differentiating flags to the splay tree's
+ * numerical domain.
+ */
+static
+struct buf *
+buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
+{
+	struct buf dummy;
+	struct buf *lefttreemax, *righttreemin, *y;
+
+	if (root == NULL)
+		return (NULL);
+	lefttreemax = righttreemin = &dummy;
+	for (;;) {
+		if (lblkno < root->b_lblkno ||
+		    (lblkno == root->b_lblkno &&
+		    (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
+			if ((y = root->b_left) == NULL)
+				break;
+			if (lblkno < y->b_lblkno) {
+				/* Rotate right. */
+				root->b_left = y->b_right;
+				y->b_right = root;
+				root = y;
+				if ((y = root->b_left) == NULL)
+					break;
+			}
+			/* Link into the new root's right tree. */
+			righttreemin->b_left = root;
+			righttreemin = root;
+		} else if (lblkno > root->b_lblkno ||
+		    (lblkno == root->b_lblkno &&
+		    (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
+			if ((y = root->b_right) == NULL)
+				break;
+			if (lblkno > y->b_lblkno) {
+				/* Rotate left. */
+				root->b_right = y->b_left;
+				y->b_left = root;
+				root = y;
+				if ((y = root->b_right) == NULL)
+					break;
+			}
+			/* Link into the new root's left tree. */
+			lefttreemax->b_right = root;
+			lefttreemax = root;
+		} else {
+			break;
+		}
+		root = y;
+	}
+	/* Assemble the new root. */
+	lefttreemax->b_right = root->b_left;
+	righttreemin->b_left = root->b_right;
+	root->b_left = dummy.b_right;
+	root->b_right = dummy.b_left;
+	return (root);
+}
+
+static
+void
+buf_vlist_remove(struct buf *bp)
+{
+	struct vnode *vp = bp->b_vp;
+	struct buf *root;
+
+	if (bp->b_xflags & BX_VNDIRTY) {
+		if (bp != vp->v_dirtyblkroot) {
+			root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
+			KASSERT(root == bp, ("splay lookup failed during dirty remove"));
+		}
+		if (bp->b_left == NULL) {
+			root = bp->b_right;
+		} else {
+			root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
+			root->b_right = bp->b_right;
+		}
+		vp->v_dirtyblkroot = root;
+		TAILQ_REMOVE(&vp->v_dirtyblkhd, bp, b_vnbufs);
+	} else {
+		/* KASSERT(bp->b_xflags & BX_VNCLEAN, ("bp wasn't clean")); */
+		if (bp != vp->v_cleanblkroot) {
+			root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
+			KASSERT(root == bp, ("splay lookup failed during clean remove"));
+		}
+		if (bp->b_left == NULL) {
+			root = bp->b_right;
+		} else {
+			root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
+			root->b_right = bp->b_right;
+		}
+		vp->v_cleanblkroot = root;
+		TAILQ_REMOVE(&vp->v_cleanblkhd, bp, b_vnbufs);
+	}
+	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
+}
+
+/*
+ * Add the buffer to the sorted clean or dirty block list using a
+ * splay tree algorithm.
+ *
+ * NOTE: xflags is passed as a constant, optimizing this inline function!
+ */
+static
+void
+buf_vlist_add(struct buf *bp, struct vnode *vp, b_xflags_t xflags)
+{
+	struct buf *root;
+
+	bp->b_xflags |= xflags;
+	if (xflags & BX_VNDIRTY) {
+		root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
+		if (root == NULL) {
+			bp->b_left = NULL;
+			bp->b_right = NULL;
+			TAILQ_INSERT_TAIL(&vp->v_dirtyblkhd, bp, b_vnbufs);
+		} else if (bp->b_lblkno < root->b_lblkno ||
+		    (bp->b_lblkno == root->b_lblkno &&
+		    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
+			bp->b_left = root->b_left;
+			bp->b_right = root;
+			root->b_left = NULL;
+			TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
+		} else {
+			bp->b_right = root->b_right;
+			bp->b_left = root;
+			root->b_right = NULL;
+			TAILQ_INSERT_AFTER(&vp->v_dirtyblkhd,
+			    root, bp, b_vnbufs);
+		}
+		vp->v_dirtyblkroot = bp;
+	} else {
+		/* KASSERT(xflags & BX_VNCLEAN, ("xflags not clean")); */
+		root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
+		if (root == NULL) {
+			bp->b_left = NULL;
+			bp->b_right = NULL;
+			TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
+		} else if (bp->b_lblkno < root->b_lblkno ||
+		    (bp->b_lblkno == root->b_lblkno &&
+		    (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
+			bp->b_left = root->b_left;
+			bp->b_right = root;
+			root->b_left = NULL;
+			TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
+		} else {
+			bp->b_right = root->b_right;
+			bp->b_left = root;
+			root->b_right = NULL;
+			TAILQ_INSERT_AFTER(&vp->v_cleanblkhd,
+			    root, bp, b_vnbufs);
+		}
+		vp->v_cleanblkroot = bp;
+	}
+}
+
+#ifndef USE_BUFHASH
+
+/*
+ * Lookup a buffer using the splay tree.  Note that we specifically avoid
+ * shadow buffers used in background bitmap writes.
+ *
+ * This code isn't quite efficient as it could be because we are maintaining
+ * two sorted lists and do not know which list the block resides in.
+ */
+struct buf *
+gbincore(struct vnode *vp, daddr_t lblkno)
+{
+	struct buf *bp;
+
+	GIANT_REQUIRED;
+
+	bp = vp->v_cleanblkroot = buf_splay(lblkno, 0, vp->v_cleanblkroot);
+	if (bp && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
+		return(bp);
+	bp = vp->v_dirtyblkroot = buf_splay(lblkno, 0, vp->v_dirtyblkroot);
+	if (bp && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
+		return(bp);
+	return(NULL);
+}
+
+#endif
+
+/*
  * Associate a buffer with a vnode.
  */
 void
@@ -1143,6 +1329,9 @@ bgetvp(vp, bp)
 
 	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
 
+	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
+	    ("bgetvp: bp already attached! %p", bp));
+
 	vhold(vp);
 	bp->b_vp = vp;
 	bp->b_dev = vn_todev(vp);
@@ -1150,9 +1339,7 @@
 	 * Insert onto list for new vnode.
 	 */
 	s = splbio();
-	bp->b_xflags |= BX_VNCLEAN;
-	bp->b_xflags &= ~BX_VNDIRTY;
-	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
+	buf_vlist_add(bp, vp, BX_VNCLEAN);
 	splx(s);
 }
@@ -1164,7 +1351,6 @@
 brelvp(bp)
 	register struct buf *bp;
 {
 	struct vnode *vp;
-	struct buflists *listheadp;
 	int s;
 
 	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
@@ -1174,14 +1360,8 @@
 	 */
 	vp = bp->b_vp;
 	s = splbio();
-	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
-		if (bp->b_xflags & BX_VNDIRTY)
-			listheadp = &vp->v_dirtyblkhd;
-		else
-			listheadp = &vp->v_cleanblkhd;
-		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
-		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
-	}
+	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
+		buf_vlist_remove(bp);
 	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
 		vp->v_flag &= ~VONWORKLST;
 		LIST_REMOVE(vp, v_synclist);
@@ -1396,7 +1576,6 @@
 reassignbuf(bp, newvp)
 	register struct buf *bp;
 	register struct vnode *newvp;
 {
-	struct buflists *listheadp;
 	int delay;
 	int s;
@@ -1418,12 +1597,7 @@
 	 * Delete from old vnode list, if on one.
 	 */
 	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
-		if (bp->b_xflags & BX_VNDIRTY)
-			listheadp = &bp->b_vp->v_dirtyblkhd;
-		else
-			listheadp = &bp->b_vp->v_cleanblkhd;
-		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
-		bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
+		buf_vlist_remove(bp);
 		if (bp->b_vp != newvp) {
 			vdrop(bp->b_vp);
 			bp->b_vp = NULL;	/* for clarification */
@@ -1434,9 +1608,6 @@
 	 * of clean buffers.
 	 */
 	if (bp->b_flags & B_DELWRI) {
-		struct buf *tbp;
-
-		listheadp = &newvp->v_dirtyblkhd;
 		if ((newvp->v_flag & VONWORKLST) == 0) {
 			switch (newvp->v_type) {
 			case VDIR:
@@ -1453,61 +1624,10 @@
 			}
 			vn_syncer_add_to_worklist(newvp, delay);
 		}
-		bp->b_xflags |= BX_VNDIRTY;
-		tbp = TAILQ_FIRST(listheadp);
-		if (tbp == NULL ||
-		    bp->b_lblkno == 0 ||
-		    (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
-		    (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
-			TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
-			++reassignbufsortgood;
-		} else if (bp->b_lblkno < 0) {
-			TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
-			++reassignbufsortgood;
-		} else if (reassignbufmethod == 1) {
-			/*
-			 * New sorting algorithm, only handle sequential case,
-			 * otherwise append to end (but before metadata)
-			 */
-			if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
-			    (tbp->b_xflags & BX_VNDIRTY)) {
-				/*
-				 * Found the best place to insert the buffer
-				 */
-				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
-				++reassignbufsortgood;
-			} else {
-				/*
-				 * Missed, append to end, but before meta-data.
-				 * We know that the head buffer in the list is
-				 * not meta-data due to prior conditionals.
-				 *
-				 * Indirect effects: NFS second stage write
-				 * tends to wind up here, giving maximum
-				 * distance between the unstable write and the
-				 * commit rpc.
-				 */
-				tbp = TAILQ_LAST(listheadp, buflists);
-				while (tbp && tbp->b_lblkno < 0)
-					tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
-				TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
-				++reassignbufsortbad;
-			}
-		} else {
-			/*
-			 * Old sorting algorithm, scan queue and insert
-			 */
-			struct buf *ttbp;
-			while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
-			    (ttbp->b_lblkno < bp->b_lblkno)) {
-				++reassignbufloops;
-				tbp = ttbp;
-			}
-			TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
-		}
+		buf_vlist_add(bp, newvp, BX_VNDIRTY);
 	} else {
-		bp->b_xflags |= BX_VNCLEAN;
-		TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
+		buf_vlist_add(bp, newvp, BX_VNCLEAN);
+
 		if ((newvp->v_flag & VONWORKLST) && TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
 			newvp->v_flag &= ~VONWORKLST;
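buf_splay() above is the classic top-down splay (Sleator/Tarjan): walk down from the root, hanging pieces of the search path off a dummy node as accumulated left and right subtrees, then reassemble around the last node visited. The same algorithm on plain int keys follows as a runnable sketch; all names are illustrative, and insert() attaches a new node at the root the same way buf_vlist_add() does.

#include <stdio.h>
#include <stdlib.h>

struct node {
	int		key;
	struct node	*left, *right;
};

static struct node *
splay(int key, struct node *root)
{
	struct node dummy, *ltreemax, *rtreemin, *y;

	if (root == NULL)
		return (NULL);
	dummy.left = dummy.right = NULL;
	ltreemax = rtreemin = &dummy;
	for (;;) {
		if (key < root->key) {
			if ((y = root->left) == NULL)
				break;
			if (key < y->key) {	/* rotate right */
				root->left = y->right;
				y->right = root;
				root = y;
				if ((y = root->left) == NULL)
					break;
			}
			rtreemin->left = root;	/* link into right tree */
			rtreemin = root;
		} else if (key > root->key) {
			if ((y = root->right) == NULL)
				break;
			if (key > y->key) {	/* rotate left */
				root->right = y->left;
				y->left = root;
				root = y;
				if ((y = root->right) == NULL)
					break;
			}
			ltreemax->right = root;	/* link into left tree */
			ltreemax = root;
		} else {
			break;
		}
		root = y;
	}
	/* Assemble: hang the collected subtrees off the new root. */
	ltreemax->right = root->left;
	rtreemin->left = root->right;
	root->left = dummy.right;
	root->right = dummy.left;
	return (root);
}

/* Insert at the root after splaying, mirroring buf_vlist_add(). */
static struct node *
insert(int key, struct node *root)
{
	struct node *n = malloc(sizeof(*n));

	if (n == NULL)
		abort();
	n->key = key;
	root = splay(key, root);
	if (root == NULL) {
		n->left = n->right = NULL;
	} else if (key < root->key) {
		n->left = root->left;
		n->right = root;
		root->left = NULL;
	} else {
		n->right = root->right;
		n->left = root;
		root->right = NULL;
	}
	return (n);
}

int
main(void)
{
	struct node *root = NULL;
	int keys[] = { 5, 1, 9, 3, 7 };
	size_t i;

	for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++)
		root = insert(keys[i], root);
	root = splay(7, root);
	printf("root after lookup of 7: %d\n", root->key);	/* prints 7 */
	return (0);
}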
diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c
index 897346d..29f2da4 100644
--- a/sys/nfsclient/nfs_bio.c
+++ b/sys/nfsclient/nfs_bio.c
@@ -428,7 +428,7 @@ nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
 		for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
 		    (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
 			rabn = lbn + 1 + nra;
-			if (!incore(vp, rabn)) {
+			if (incore(vp, rabn) == NULL) {
 				rabp = nfs_getcacheblk(vp, rabn, biosize, td);
 				if (!rabp)
 					return (EINTR);
@@ -613,7 +613,7 @@ again:
 		    (bp->b_flags & B_INVAL) == 0 &&
 		    (np->n_direofoffset == 0 ||
 		    (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
-		    !incore(vp, lbn + 1)) {
+		    incore(vp, lbn + 1) == NULL) {
 			rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
 			if (rabp) {
 				if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
diff --git a/sys/nfsserver/nfs_serv.c b/sys/nfsserver/nfs_serv.c
index 73bee42..131c0b4 100644
--- a/sys/nfsserver/nfs_serv.c
+++ b/sys/nfsserver/nfs_serv.c
@@ -3695,8 +3695,14 @@ nfsrv_commit(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
 			 * If we have a buffer and it is marked B_DELWRI we
 			 * have to lock and write it.  Otherwise the prior
 			 * write is assumed to have already been committed.
+			 *
+			 * gbincore() can return invalid buffers now so we
+			 * have to check that bit as well (though B_DELWRI
+			 * should not be set if B_INVAL is set there could be
+			 * a race here since we haven't locked the buffer).
 			 */
-			if ((bp = gbincore(vp, lblkno)) != NULL && (bp->b_flags & B_DELWRI)) {
+			if ((bp = gbincore(vp, lblkno)) != NULL &&
+			    (bp->b_flags & (B_DELWRI|B_INVAL)) == B_DELWRI) {
 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 					BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL);
 					continue; /* retry */
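The new nfsrv_commit() test uses a compact idiom worth noting: masking b_flags with (B_DELWRI|B_INVAL) and comparing the result against B_DELWRI asserts "delayed write pending" and "not invalidated" in a single expression. A tiny self-check, with flag values invented for the demo:

#include <assert.h>

#define B_DELWRI	0x1	/* values illustrative, not the kernel's */
#define B_INVAL		0x2

static int
needs_commit(long flags)
{
	/* True only when B_DELWRI is set and B_INVAL is clear. */
	return ((flags & (B_DELWRI | B_INVAL)) == B_DELWRI);
}

int
main(void)
{
	assert(needs_commit(B_DELWRI));			/* delayed write, valid */
	assert(!needs_commit(B_DELWRI | B_INVAL));	/* invalidated: skip */
	assert(!needs_commit(0));			/* clean: skip */
	return (0);
}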
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 6353276..91a803c 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -78,6 +78,8 @@ extern struct buf_ops buf_ops_bio;
 
 struct vm_object;
 
+typedef unsigned char b_xflags_t;
+
 /*
  * The buffer header describes an I/O operation in the kernel.
  *
@@ -117,12 +119,16 @@ struct buf {
 #define B_MAGIC_NFS	0x67238234
 	void	(*b_iodone)(struct buf *);
 	off_t	b_offset;		/* Offset into file. */
+#ifdef USE_BUFHASH
 	LIST_ENTRY(buf) b_hash;		/* Hash chain. */
+#endif
 	TAILQ_ENTRY(buf) b_vnbufs;	/* Buffer's associated vnode. */
+	struct buf	*b_left;	/* splay tree link (V) */
+	struct buf	*b_right;	/* splay tree link (V) */
 	TAILQ_ENTRY(buf) b_freelist;	/* Free list position if not active. */
 	long	b_flags;		/* B_* flags. */
 	unsigned short b_qindex;	/* buffer queue index */
-	unsigned char b_xflags;		/* extra flags */
+	b_xflags_t b_xflags;		/* extra flags */
 	struct	lock b_lock;		/* Buffer lock */
 	long	b_bufsize;		/* Allocated buffer size. */
 	long	b_runningbufspace;	/* when I/O is running, pipelining */
@@ -250,6 +256,7 @@ struct buf {
 #define BX_BKGRDWRITE	0x00000004	/* Do writes in background */
 #define BX_BKGRDINPROG	0x00000008	/* Background write in progress */
 #define BX_BKGRDWAIT	0x00000010	/* Background write waiting */
+#define BX_BKGRDMARKER	0x00000020	/* Mark buffer for splay tree */
 
 #define NOOFFSET	(-1LL)		/* No buffer offset calculated yet */
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index bc92a9a..3c9989f 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -108,8 +108,10 @@ struct vnode {
 	vop_t	**v_op;				/* vnode operations vector */
 	TAILQ_ENTRY(vnode) v_freelist;		/* vnode freelist */
 	TAILQ_ENTRY(vnode) v_nmntvnodes;	/* vnodes for mount point */
-	struct	buflists v_cleanblkhd;		/* clean blocklist head */
-	struct	buflists v_dirtyblkhd;		/* dirty blocklist head */
+	struct	buflists v_cleanblkhd;		/* SORTED clean blocklist */
+	struct	buf *v_cleanblkroot;		/* clean buf splay tree root */
+	struct	buflists v_dirtyblkhd;		/* SORTED dirty blocklist */
+	struct	buf *v_dirtyblkroot;		/* dirty buf splay tree root */
 	LIST_ENTRY(vnode) v_synclist;		/* vnodes with dirty buffers */
 	long	v_numoutput;			/* num of writes in progress */
 	enum	vtype v_type;			/* vnode type */
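The header changes show where the new state lives: each vnode carries v_cleanblkroot and v_dirtyblkroot, caching the splay root between calls. Because a lookup splays its target to the root, a repeated gbincore() of the same block is answered at the root without another tree walk. In terms of the int-keyed splay() sketch above (lookup_cached() is a hypothetical helper, not kernel API):

/* Reuses struct node and splay() from the sketch following vfs_subr.c. */
static struct node *
lookup_cached(int key, struct node **rootp)
{
	*rootp = splay(key, *rootp);	/* target (or its neighbor) becomes root */
	if (*rootp != NULL && (*rootp)->key == key)
		return (*rootp);	/* a repeat lookup is answered here */
	return (NULL);
}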