Diffstat:
-rw-r--r--  sys/kern/vfs_bio.c        57
-rw-r--r--  sys/kern/vfs_cluster.c    12
-rw-r--r--  sys/kern/vfs_subr.c      290
-rw-r--r--  sys/nfsclient/nfs_bio.c    4
-rw-r--r--  sys/nfsserver/nfs_serv.c   8
-rw-r--r--  sys/sys/buf.h              9
-rw-r--r--  sys/sys/vnode.h            6
7 files changed, 284 insertions(+), 102 deletions(-)
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 1f86b68..a3639b4 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -189,6 +189,7 @@ static int runningbufreq;
*/
static int needsbuffer;
+#ifdef USE_BUFHASH
/*
* Mask for index into the buffer hash table, which needs to be power of 2 in
* size. Set in kern_vfs_bio_buffer_alloc.
@@ -208,6 +209,8 @@ static LIST_HEAD(bufhashhdr, buf) *bufhashtbl;
*/
static struct bufhashhdr invalhash;
+#endif
+
/*
* Definitions for the buffer free lists.
*/
@@ -233,6 +236,7 @@ const char *buf_wmesg = BUF_WMESG;
#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */
#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
+#ifdef USE_BUFHASH
/*
* Buffer hash table code. Note that the logical block scans linearly, which
* gives us some L1 cache locality.
@@ -245,6 +249,8 @@ bufhash(struct vnode *vnp, daddr_t bn)
return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]);
}
+#endif
+
/*
* numdirtywakeup:
*
@@ -463,6 +469,7 @@ kern_vfs_bio_buffer_alloc(caddr_t v, int physmem_est)
buf = (void *)v;
v = (caddr_t)(buf + nbuf);
+#ifdef USE_BUFHASH
/*
* Calculate the hash table size and reserve space
*/
@@ -471,7 +478,7 @@ kern_vfs_bio_buffer_alloc(caddr_t v, int physmem_est)
bufhashtbl = (void *)v;
v = (caddr_t)(bufhashtbl + bufhashmask);
--bufhashmask;
-
+#endif
return(v);
}
@@ -484,11 +491,15 @@ bufinit(void)
GIANT_REQUIRED;
+#ifdef USE_BUFHASH
LIST_INIT(&invalhash);
+#endif
mtx_init(&buftimelock, "buftime lock", NULL, MTX_DEF);
+#ifdef USE_BUFHASH
for (i = 0; i <= bufhashmask; i++)
LIST_INIT(&bufhashtbl[i]);
+#endif
/* next, make a null set of free lists */
for (i = 0; i < BUFFER_QUEUES; i++)
@@ -507,7 +518,9 @@ bufinit(void)
LIST_INIT(&bp->b_dep);
BUF_LOCKINIT(bp);
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
+#ifdef USE_BUFHASH
LIST_INSERT_HEAD(&invalhash, bp, b_hash);
+#endif
}
/*
@@ -787,10 +800,15 @@ bwrite(struct buf * bp)
/* get a new block */
newbp = geteblk(bp->b_bufsize);
- /* set it to be identical to the old block */
+ /*
+ * set it to be identical to the old block. We have to
+ * set b_lblkno and BX_BKGRDMARKER before calling bgetvp()
+ * to avoid confusing the splay tree and gbincore().
+ */
memcpy(newbp->b_data, bp->b_data, bp->b_bufsize);
- bgetvp(bp->b_vp, newbp);
newbp->b_lblkno = bp->b_lblkno;
+ newbp->b_xflags |= BX_BKGRDMARKER;
+ bgetvp(bp->b_vp, newbp);
newbp->b_blkno = bp->b_blkno;
newbp->b_offset = bp->b_offset;
newbp->b_iodone = vfs_backgroundwritedone;
@@ -1302,8 +1320,10 @@ brelse(struct buf * bp)
bp->b_qindex = QUEUE_EMPTY;
}
TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
+#ifdef USE_BUFHASH
LIST_REMOVE(bp, b_hash);
LIST_INSERT_HEAD(&invalhash, bp, b_hash);
+#endif
bp->b_dev = NODEV;
/* buffers with junk contents */
} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
@@ -1314,8 +1334,10 @@ brelse(struct buf * bp)
panic("losing buffer 2");
bp->b_qindex = QUEUE_CLEAN;
TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
+#ifdef USE_BUFHASH
LIST_REMOVE(bp, b_hash);
LIST_INSERT_HEAD(&invalhash, bp, b_hash);
+#endif
bp->b_dev = NODEV;
/* buffers that are locked */
@@ -1336,11 +1358,17 @@ brelse(struct buf * bp)
}
/*
- * If B_INVAL, clear B_DELWRI. We've already placed the buffer
- * on the correct queue.
+ * If B_INVAL and B_DELWRI is set, clear B_DELWRI. We have already
+ * placed the buffer on the correct queue. We must also disassociate
+ * the device and vnode for a B_INVAL buffer so gbincore() doesn't
+ * find it.
*/
- if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI))
- bundirty(bp);
+ if (bp->b_flags & B_INVAL) {
+ if (bp->b_flags & B_DELWRI)
+ bundirty(bp);
+ if (bp->b_vp)
+ brelvp(bp);
+ }
/*
* Fixup numfreebuffers count. The bp is on an appropriate queue
@@ -1493,7 +1521,10 @@ vfs_vmio_release(bp)
brelvp(bp);
}
+#ifdef USE_BUFHASH
/*
+ * XXX MOVED TO VFS_SUBR.C
+ *
* Check to see if a block is currently memory resident.
*/
struct buf *
@@ -1514,6 +1545,7 @@ gbincore(struct vnode * vp, daddr_t blkno)
}
return (bp);
}
+#endif
/*
* vfs_bio_awrite:
@@ -1782,8 +1814,10 @@ restart:
buf_deallocate(bp);
if (bp->b_xflags & BX_BKGRDINPROG)
panic("losing buffer 3");
+#ifdef USE_BUFHASH
LIST_REMOVE(bp, b_hash);
LIST_INSERT_HEAD(&invalhash, bp, b_hash);
+#endif
if (bp->b_bufsize)
allocbuf(bp, 0);
@@ -2231,7 +2265,9 @@ getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
struct buf *bp;
int s;
+#ifdef USE_BUFHASH
struct bufhashhdr *bh;
+#endif
if (size > MAXBSIZE)
panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
@@ -2392,6 +2428,11 @@ loop:
* race because we are safely running at splbio() from the
* point of the duplicate buffer creation through to here,
* and we've locked the buffer.
+ *
+ * Note: this must occur before we associate the buffer
+ * with the vp, especially considering limitations in
+ * the splay tree implementation when dealing with duplicate
+ * lblkno's.
*/
if (gbincore(vp, blkno)) {
bp->b_flags |= B_INVAL;
@@ -2407,9 +2448,11 @@ loop:
bp->b_offset = offset;
bgetvp(vp, bp);
+#ifdef USE_BUFHASH
LIST_REMOVE(bp, b_hash);
bh = bufhash(vp, blkno);
LIST_INSERT_HEAD(bh, bp, b_hash);
+#endif
/*
* set B_VMIO bit. allocbuf() the buffer bigger. Since the
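
The ordering constraint documented in the bwrite() hunk above falls out of
how the splay tree introduced in vfs_subr.c keys its entries: buffers are
ordered by b_lblkno with the BX_BKGRDMARKER bit as a tie-breaker, so the
background shadow buffer must carry both fields before bgetvp() inserts it
into the tree. A minimal sketch of the effective comparison, mirroring the
tests in buf_splay() below rather than quoting committed code:

/*
 * Illustrative only: the composite key the splay tree orders by.
 * Two buffers may share a logical block number (foreground plus its
 * background shadow); the BX_BKGRDMARKER bit keeps their keys distinct.
 */
static int
buf_key_cmp(daddr_t lblkno, b_xflags_t xflags, struct buf *bp)
{
        if (lblkno != bp->b_lblkno)
                return (lblkno < bp->b_lblkno ? -1 : 1);
        /* Equal block numbers: the marker bit breaks the tie. */
        if ((xflags & BX_BKGRDMARKER) != (bp->b_xflags & BX_BKGRDMARKER))
                return ((xflags & BX_BKGRDMARKER) ? 1 : -1);
        return (0);
}

Setting b_lblkno or the marker only after bgetvp() would insert the shadow
under a stale key, leaving it where gbincore() could return it in place of
the foreground buffer.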
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 4c11952..452dfa1 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -152,10 +152,13 @@ cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
*/
s = splbio();
for (i = 1; i < maxra; i++) {
-
- if (!(tbp = incore(vp, lblkno+i))) {
+ /*
+ * Stop if the buffer does not exist or it
+ * is invalid (about to go away?)
+ */
+ tbp = gbincore(vp, lblkno+i);
+ if (tbp == NULL || (tbp->b_flags & B_INVAL))
break;
- }
/*
* Set another read-ahead mark so we know
@@ -396,7 +399,8 @@ cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
* would block in the lock. The same checks have to
* be made again after we officially get the buffer.
*/
- if ((tbp = incore(vp, lbn + i)) != NULL) {
+ if ((tbp = incore(vp, lbn + i)) != NULL &&
+ (tbp->b_flags & B_INVAL) == 0) {
if (BUF_LOCK(tbp, LK_EXCLUSIVE | LK_NOWAIT))
break;
BUF_UNLOCK(tbp);
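
Both cluster hunks tighten the same predicate: read-ahead must stop not
only when a buffer is absent, but also when it exists and is already
marked B_INVAL, since lookups can now return such about-to-go-away
buffers. A hypothetical helper restating that test (not in the commit,
and note cluster_rbuild() uses incore() rather than gbincore()):

/*
 * Hypothetical: read-ahead stops at a missing buffer or at one that
 * exists but is invalid and about to be torn down.
 */
static __inline int
cluster_stop_at(struct vnode *vp, daddr_t lbn)
{
        struct buf *tbp;

        tbp = gbincore(vp, lbn);
        return (tbp == NULL || (tbp->b_flags & B_INVAL) != 0);
}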
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 1762a1f..3ebb88e 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -120,15 +120,6 @@ SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
*/
static int reassignbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
-static int reassignbufloops;
-SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
-static int reassignbufsortgood;
-SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
-static int reassignbufsortbad;
-SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
-/* Set to 0 for old insertion-sort based reassignbuf, 1 for modern method. */
-static int reassignbufmethod = 1;
-SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
static int nameileafonly;
SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");
@@ -812,6 +803,8 @@ getnewvnode(tag, mp, vops, vpp)
vp->v_cstart = 0;
vp->v_clen = 0;
vp->v_socket = 0;
+ KASSERT(vp->v_cleanblkroot == NULL, ("cleanblkroot not NULL"));
+ KASSERT(vp->v_dirtyblkroot == NULL, ("dirtyblkroot not NULL"));
} else {
mtx_unlock(&vnode_free_list_mtx);
vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK);
@@ -1132,6 +1125,199 @@ restartsync:
}
/*
+ * buf_splay() - splay tree core for the clean/dirty list of buffers in
+ * a vnode.
+ *
+ * NOTE: We have to deal with the special case of a background bitmap
+ * buffer, a situation where two buffers will have the same logical
+ * block offset. We (1) want only the foreground buffer to be accessed
+ * in a lookup and (2) must differentiate between the foreground and
+ * background buffer in the splay tree algorithm because the splay
+ * tree cannot normally handle multiple entities with the same 'index'.
+ * We accomplish this by adding differentiating flags to the splay tree's
+ * numerical domain.
+ */
+static
+struct buf *
+buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
+{
+ struct buf dummy;
+ struct buf *lefttreemax, *righttreemin, *y;
+
+ if (root == NULL)
+ return (NULL);
+ lefttreemax = righttreemin = &dummy;
+ for (;;) {
+ if (lblkno < root->b_lblkno ||
+ (lblkno == root->b_lblkno &&
+ (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
+ if ((y = root->b_left) == NULL)
+ break;
+ if (lblkno < y->b_lblkno) {
+ /* Rotate right. */
+ root->b_left = y->b_right;
+ y->b_right = root;
+ root = y;
+ if ((y = root->b_left) == NULL)
+ break;
+ }
+ /* Link into the new root's right tree. */
+ righttreemin->b_left = root;
+ righttreemin = root;
+ } else if (lblkno > root->b_lblkno ||
+ (lblkno == root->b_lblkno &&
+ (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
+ if ((y = root->b_right) == NULL)
+ break;
+ if (lblkno > y->b_lblkno) {
+ /* Rotate left. */
+ root->b_right = y->b_left;
+ y->b_left = root;
+ root = y;
+ if ((y = root->b_right) == NULL)
+ break;
+ }
+ /* Link into the new root's left tree. */
+ lefttreemax->b_right = root;
+ lefttreemax = root;
+ } else {
+ break;
+ }
+ root = y;
+ }
+ /* Assemble the new root. */
+ lefttreemax->b_right = root->b_left;
+ righttreemin->b_left = root->b_right;
+ root->b_left = dummy.b_right;
+ root->b_right = dummy.b_left;
+ return (root);
+}
+
+static
+void
+buf_vlist_remove(struct buf *bp)
+{
+ struct vnode *vp = bp->b_vp;
+ struct buf *root;
+
+ if (bp->b_xflags & BX_VNDIRTY) {
+ if (bp != vp->v_dirtyblkroot) {
+ root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
+ KASSERT(root == bp, ("splay lookup failed during dirty remove"));
+ }
+ if (bp->b_left == NULL) {
+ root = bp->b_right;
+ } else {
+ root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
+ root->b_right = bp->b_right;
+ }
+ vp->v_dirtyblkroot = root;
+ TAILQ_REMOVE(&vp->v_dirtyblkhd, bp, b_vnbufs);
+ } else {
+ /* KASSERT(bp->b_xflags & BX_VNCLEAN, ("bp wasn't clean")); */
+ if (bp != vp->v_cleanblkroot) {
+ root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
+ KASSERT(root == bp, ("splay lookup failed during clean remove"));
+ }
+ if (bp->b_left == NULL) {
+ root = bp->b_right;
+ } else {
+ root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
+ root->b_right = bp->b_right;
+ }
+ vp->v_cleanblkroot = root;
+ TAILQ_REMOVE(&vp->v_cleanblkhd, bp, b_vnbufs);
+ }
+ bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
+}
+
+/*
+ * Add the buffer to the sorted clean or dirty block list using a
+ * splay tree algorithm.
+ *
+ * NOTE: xflags is passed as a constant, optimizing this inline function!
+ */
+static
+void
+buf_vlist_add(struct buf *bp, struct vnode *vp, b_xflags_t xflags)
+{
+ struct buf *root;
+
+ bp->b_xflags |= xflags;
+ if (xflags & BX_VNDIRTY) {
+ root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_dirtyblkroot);
+ if (root == NULL) {
+ bp->b_left = NULL;
+ bp->b_right = NULL;
+ TAILQ_INSERT_TAIL(&vp->v_dirtyblkhd, bp, b_vnbufs);
+ } else if (bp->b_lblkno < root->b_lblkno ||
+ (bp->b_lblkno == root->b_lblkno &&
+ (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
+ bp->b_left = root->b_left;
+ bp->b_right = root;
+ root->b_left = NULL;
+ TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
+ } else {
+ bp->b_right = root->b_right;
+ bp->b_left = root;
+ root->b_right = NULL;
+ TAILQ_INSERT_AFTER(&vp->v_dirtyblkhd,
+ root, bp, b_vnbufs);
+ }
+ vp->v_dirtyblkroot = bp;
+ } else {
+ /* KASSERT(xflags & BX_VNCLEAN, ("xflags not clean")); */
+ root = buf_splay(bp->b_lblkno, bp->b_xflags, vp->v_cleanblkroot);
+ if (root == NULL) {
+ bp->b_left = NULL;
+ bp->b_right = NULL;
+ TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
+ } else if (bp->b_lblkno < root->b_lblkno ||
+ (bp->b_lblkno == root->b_lblkno &&
+ (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
+ bp->b_left = root->b_left;
+ bp->b_right = root;
+ root->b_left = NULL;
+ TAILQ_INSERT_BEFORE(root, bp, b_vnbufs);
+ } else {
+ bp->b_right = root->b_right;
+ bp->b_left = root;
+ root->b_right = NULL;
+ TAILQ_INSERT_AFTER(&vp->v_cleanblkhd,
+ root, bp, b_vnbufs);
+ }
+ vp->v_cleanblkroot = bp;
+ }
+}
+
+#ifndef USE_BUFHASH
+
+/*
+ * Lookup a buffer using the splay tree. Note that we specifically avoid
+ * shadow buffers used in background bitmap writes.
+ *
+ * This code isn't quite as efficient as it could be because we are maintaining
+ * two sorted lists and do not know which list the block resides in.
+ */
+struct buf *
+gbincore(struct vnode *vp, daddr_t lblkno)
+{
+ struct buf *bp;
+
+ GIANT_REQUIRED;
+
+ bp = vp->v_cleanblkroot = buf_splay(lblkno, 0, vp->v_cleanblkroot);
+ if (bp && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
+ return(bp);
+ bp = vp->v_dirtyblkroot = buf_splay(lblkno, 0, vp->v_dirtyblkroot);
+ if (bp && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
+ return(bp);
+ return(NULL);
+}
+
+#endif
+
+/*
* Associate a buffer with a vnode.
*/
void
@@ -1143,6 +1329,9 @@ bgetvp(vp, bp)
KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
+ KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
+ ("bgetvp: bp already attached! %p", bp));
+
vhold(vp);
bp->b_vp = vp;
bp->b_dev = vn_todev(vp);
@@ -1150,9 +1339,7 @@ bgetvp(vp, bp)
* Insert onto list for new vnode.
*/
s = splbio();
- bp->b_xflags |= BX_VNCLEAN;
- bp->b_xflags &= ~BX_VNDIRTY;
- TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
+ buf_vlist_add(bp, vp, BX_VNCLEAN);
splx(s);
}
@@ -1164,7 +1351,6 @@ brelvp(bp)
register struct buf *bp;
{
struct vnode *vp;
- struct buflists *listheadp;
int s;
KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
@@ -1174,14 +1360,8 @@ brelvp(bp)
*/
vp = bp->b_vp;
s = splbio();
- if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
- if (bp->b_xflags & BX_VNDIRTY)
- listheadp = &vp->v_dirtyblkhd;
- else
- listheadp = &vp->v_cleanblkhd;
- TAILQ_REMOVE(listheadp, bp, b_vnbufs);
- bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
- }
+ if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
+ buf_vlist_remove(bp);
if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
vp->v_flag &= ~VONWORKLST;
LIST_REMOVE(vp, v_synclist);
@@ -1396,7 +1576,6 @@ reassignbuf(bp, newvp)
register struct buf *bp;
register struct vnode *newvp;
{
- struct buflists *listheadp;
int delay;
int s;
@@ -1418,12 +1597,7 @@ reassignbuf(bp, newvp)
* Delete from old vnode list, if on one.
*/
if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
- if (bp->b_xflags & BX_VNDIRTY)
- listheadp = &bp->b_vp->v_dirtyblkhd;
- else
- listheadp = &bp->b_vp->v_cleanblkhd;
- TAILQ_REMOVE(listheadp, bp, b_vnbufs);
- bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
+ buf_vlist_remove(bp);
if (bp->b_vp != newvp) {
vdrop(bp->b_vp);
bp->b_vp = NULL; /* for clarification */
@@ -1434,9 +1608,6 @@ reassignbuf(bp, newvp)
* of clean buffers.
*/
if (bp->b_flags & B_DELWRI) {
- struct buf *tbp;
-
- listheadp = &newvp->v_dirtyblkhd;
if ((newvp->v_flag & VONWORKLST) == 0) {
switch (newvp->v_type) {
case VDIR:
@@ -1453,61 +1624,10 @@ reassignbuf(bp, newvp)
}
vn_syncer_add_to_worklist(newvp, delay);
}
- bp->b_xflags |= BX_VNDIRTY;
- tbp = TAILQ_FIRST(listheadp);
- if (tbp == NULL ||
- bp->b_lblkno == 0 ||
- (bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
- (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
- TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
- ++reassignbufsortgood;
- } else if (bp->b_lblkno < 0) {
- TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
- ++reassignbufsortgood;
- } else if (reassignbufmethod == 1) {
- /*
- * New sorting algorithm, only handle sequential case,
- * otherwise append to end (but before metadata)
- */
- if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
- (tbp->b_xflags & BX_VNDIRTY)) {
- /*
- * Found the best place to insert the buffer
- */
- TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
- ++reassignbufsortgood;
- } else {
- /*
- * Missed, append to end, but before meta-data.
- * We know that the head buffer in the list is
- * not meta-data due to prior conditionals.
- *
- * Indirect effects: NFS second stage write
- * tends to wind up here, giving maximum
- * distance between the unstable write and the
- * commit rpc.
- */
- tbp = TAILQ_LAST(listheadp, buflists);
- while (tbp && tbp->b_lblkno < 0)
- tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
- TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
- ++reassignbufsortbad;
- }
- } else {
- /*
- * Old sorting algorithm, scan queue and insert
- */
- struct buf *ttbp;
- while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
- (ttbp->b_lblkno < bp->b_lblkno)) {
- ++reassignbufloops;
- tbp = ttbp;
- }
- TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
- }
+ buf_vlist_add(bp, newvp, BX_VNDIRTY);
} else {
- bp->b_xflags |= BX_VNCLEAN;
- TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
+ buf_vlist_add(bp, newvp, BX_VNCLEAN);
+
if ((newvp->v_flag & VONWORKLST) &&
TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
newvp->v_flag &= ~VONWORKLST;
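
buf_splay() above is a classic Sleator-Tarjan top-down splay: it returns
the new root, which is the matching buffer when one exists and otherwise
the last node on the search path (a neighbor of the key), which is why
gbincore() must re-check b_lblkno after splaying. A standalone
demonstration reduced to integer keys, assuming nothing from the kernel;
splay() is a line-for-line analogue of buf_splay() and insert() mirrors
the root-relinking in buf_vlist_add():

#include <stdio.h>
#include <stdlib.h>

struct node {
        int key;
        struct node *left, *right;
};

static struct node *
splay(int key, struct node *root)
{
        struct node dummy = { 0, NULL, NULL };
        struct node *ltmax, *rtmin, *y;

        if (root == NULL)
                return (NULL);
        ltmax = rtmin = &dummy;
        for (;;) {
                if (key < root->key) {
                        if ((y = root->left) == NULL)
                                break;
                        if (key < y->key) {             /* rotate right */
                                root->left = y->right;
                                y->right = root;
                                root = y;
                                if ((y = root->left) == NULL)
                                        break;
                        }
                        rtmin->left = root;             /* link right */
                        rtmin = root;
                } else if (key > root->key) {
                        if ((y = root->right) == NULL)
                                break;
                        if (key > y->key) {             /* rotate left */
                                root->right = y->left;
                                y->left = root;
                                root = y;
                                if ((y = root->right) == NULL)
                                        break;
                        }
                        ltmax->right = root;            /* link left */
                        ltmax = root;
                } else
                        break;
                root = y;
        }
        /* Assemble the new root from the accumulated subtrees. */
        ltmax->right = root->left;
        rtmin->left = root->right;
        root->left = dummy.right;
        root->right = dummy.left;
        return (root);
}

static struct node *
insert(int key, struct node *root)
{
        struct node *n = malloc(sizeof(*n));

        n->key = key;
        root = splay(key, root);
        if (root == NULL || key < root->key) {
                n->left = root ? root->left : NULL;
                n->right = root;
                if (root)
                        root->left = NULL;
        } else {
                n->right = root->right;
                n->left = root;
                root->right = NULL;
        }
        return (n);
}

int
main(void)
{
        struct node *root = NULL;
        int keys[] = { 40, 10, 30, 20 };

        for (int i = 0; i < 4; i++)
                root = insert(keys[i], root);
        root = splay(25, root);
        /* 25 is absent, so the root is a neighbor from the search path. */
        printf("root after splay(25): %d\n", root->key);
        return (0);
}

For this insertion order the program prints 30: the missed lookup leaves a
neighbor of the key at the root, exactly the case the b_lblkno comparison
in gbincore() filters out.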
diff --git a/sys/nfsclient/nfs_bio.c b/sys/nfsclient/nfs_bio.c
index 897346d..29f2da4 100644
--- a/sys/nfsclient/nfs_bio.c
+++ b/sys/nfsclient/nfs_bio.c
@@ -428,7 +428,7 @@ nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
rabn = lbn + 1 + nra;
- if (!incore(vp, rabn)) {
+ if (incore(vp, rabn) == NULL) {
rabp = nfs_getcacheblk(vp, rabn, biosize, td);
if (!rabp)
return (EINTR);
@@ -613,7 +613,7 @@ again:
(bp->b_flags & B_INVAL) == 0 &&
(np->n_direofoffset == 0 ||
(lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
- !incore(vp, lbn + 1)) {
+ incore(vp, lbn + 1) == NULL) {
rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
if (rabp) {
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
diff --git a/sys/nfsserver/nfs_serv.c b/sys/nfsserver/nfs_serv.c
index 73bee42..131c0b4 100644
--- a/sys/nfsserver/nfs_serv.c
+++ b/sys/nfsserver/nfs_serv.c
@@ -3695,8 +3695,14 @@ nfsrv_commit(struct nfsrv_descript *nfsd, struct nfssvc_sock *slp,
* If we have a buffer and it is marked B_DELWRI we
* have to lock and write it. Otherwise the prior
* write is assumed to have already been committed.
+ *
+ * gbincore() can return invalid buffers now, so we
+ * have to check that bit as well (though B_DELWRI
+ * should not be set if B_INVAL is set, there could be
+ * a race here since we haven't locked the buffer).
*/
- if ((bp = gbincore(vp, lblkno)) != NULL && (bp->b_flags & B_DELWRI)) {
+ if ((bp = gbincore(vp, lblkno)) != NULL &&
+ (bp->b_flags & (B_DELWRI|B_INVAL)) == B_DELWRI) {
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL);
continue; /* retry */
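
The race noted in the new comment is the standard check-then-lock problem:
the B_DELWRI/B_INVAL test runs before BUF_LOCK(), so the flags can change
before the lock is held. A sketch of the revalidation idiom such a caller
needs, with a simplified flow assumed here since the hunk is truncated
before the code that re-examines the buffer:

/*
 * Sketch: flags tested before BUF_LOCK() must be re-tested after it,
 * because an unlocked buffer can be invalidated in the window.
 */
bp = gbincore(vp, lblkno);
if (bp != NULL && (bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
        if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
                /* Revalidate: the window above was unlocked. */
                if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
                        bremfree(bp);   /* off the free queue before I/O */
                        bwrite(bp);     /* commits the delayed write */
                } else
                        BUF_UNLOCK(bp);
        }
}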
diff --git a/sys/sys/buf.h b/sys/sys/buf.h
index 6353276..91a803c 100644
--- a/sys/sys/buf.h
+++ b/sys/sys/buf.h
@@ -78,6 +78,8 @@ extern struct buf_ops buf_ops_bio;
struct vm_object;
+typedef unsigned char b_xflags_t;
+
/*
* The buffer header describes an I/O operation in the kernel.
*
@@ -117,12 +119,16 @@ struct buf {
#define B_MAGIC_NFS 0x67238234
void (*b_iodone)(struct buf *);
off_t b_offset; /* Offset into file. */
+#ifdef USE_BUFHASH
LIST_ENTRY(buf) b_hash; /* Hash chain. */
+#endif
TAILQ_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */
+ struct buf *b_left; /* splay tree link (V) */
+ struct buf *b_right; /* splay tree link (V) */
TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */
long b_flags; /* B_* flags. */
unsigned short b_qindex; /* buffer queue index */
- unsigned char b_xflags; /* extra flags */
+ b_xflags_t b_xflags; /* extra flags */
struct lock b_lock; /* Buffer lock */
long b_bufsize; /* Allocated buffer size. */
long b_runningbufspace; /* when I/O is running, pipelining */
@@ -250,6 +256,7 @@ struct buf {
#define BX_BKGRDWRITE 0x00000004 /* Do writes in background */
#define BX_BKGRDINPROG 0x00000008 /* Background write in progress */
#define BX_BKGRDWAIT 0x00000010 /* Background write waiting */
+#define BX_BKGRDMARKER 0x00000020 /* Mark buffer for splay tree */
#define NOOFFSET (-1LL) /* No buffer offset calculated yet */
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index bc92a9a..3c9989f 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -108,8 +108,10 @@ struct vnode {
vop_t **v_op; /* vnode operations vector */
TAILQ_ENTRY(vnode) v_freelist; /* vnode freelist */
TAILQ_ENTRY(vnode) v_nmntvnodes; /* vnodes for mount point */
- struct buflists v_cleanblkhd; /* clean blocklist head */
- struct buflists v_dirtyblkhd; /* dirty blocklist head */
+ struct buflists v_cleanblkhd; /* SORTED clean blocklist */
+ struct buf *v_cleanblkroot; /* clean buf splay tree root */
+ struct buflists v_dirtyblkhd; /* SORTED dirty blocklist */
+ struct buf *v_dirtyblkroot; /* dirty buf splay tree root */
LIST_ENTRY(vnode) v_synclist; /* vnodes with dirty buffers */
long v_numoutput; /* num of writes in progress */
enum vtype v_type; /* vnode type */
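
Each vnode now carries a redundant pair of structures per buffer class:
the TAILQ stays sorted by the same (b_lblkno, BX_BKGRDMARKER) key the
splay tree orders on, because buf_vlist_add() always splays to a neighbor
of the key and inserts before or after it in the list. A hypothetical
debug walk that would verify the list half of that invariant (not part of
the commit):

/*
 * Hypothetical sanity check: every buffer list hanging off a vnode
 * must remain sorted by (b_lblkno, BX_BKGRDMARKER).
 */
static void
buf_vlist_check(struct buflists *hd)
{
        struct buf *bp, *nbp;

        TAILQ_FOREACH(bp, hd, b_vnbufs) {
                if ((nbp = TAILQ_NEXT(bp, b_vnbufs)) == NULL)
                        break;
                KASSERT(bp->b_lblkno < nbp->b_lblkno ||
                    (bp->b_lblkno == nbp->b_lblkno &&
                    (bp->b_xflags & BX_BKGRDMARKER) <
                    (nbp->b_xflags & BX_BKGRDMARKER)),
                    ("buf list out of order"));
        }
}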