diff options
author | dillon <dillon@FreeBSD.org> | 2002-07-10 17:02:32 +0000 |
---|---|---|
committer | dillon <dillon@FreeBSD.org> | 2002-07-10 17:02:32 +0000 |
commit | da4e111a550227f9bab8d24999ea76a11cc5b780 (patch) | |
tree | e267b121c9bda07891245d5c4f7da3f13b8e5cdf /sys/kern/vfs_bio.c | |
parent | e555a85c675bd5ce423b4c39497e6569cd3d3e1b (diff) | |
download | FreeBSD-src-da4e111a550227f9bab8d24999ea76a11cc5b780.zip FreeBSD-src-da4e111a550227f9bab8d24999ea76a11cc5b780.tar.gz |
Replace the global buffer hash table with per-vnode splay trees using a
methodology similar to the vm_map_entry splay and the VM splay that Alan
Cox is working on. Extensive testing appears to have shown no
increase in overhead.
Disadvantages
Dirties more cache lines during lookups.
Not as fast as a hash table lookup (but still amortized O(log N) per
lookup, and optimal when there is locality of reference).
Advantages
vnode->v_dirtyblkhd is now perfectly sorted, making fsync/sync/filesystem
syncer operate more efficiently.
I get to rip out all the old hacks (some of which were mine) that tried
to keep the v_dirtyblkhd tailq sorted.
The per-vnode splay tree should be easier to lock / SMPng pushdown on
vnodes will be easier.
This commit along with another that Alan is working on for the VM page
global hash table will allow me to implement ranged fsync(), optimize
server-side nfs commit rpcs, and implement partial syncs by the
filesystem syncer (i.e. the filesystem syncer would detect that someone is
trying to get the vnode lock, remember its place, and skip to the
next vnode).
Note that the buffer cache splay is somewhat more complex than other splays
due to special handling of background bitmap writes (multiple buffers with
the same lblkno in the same vnode), and B_INVAL discontinuities between the
old hash table and the existence of the buffer on the v_cleanblkhd list.
Suggested by: alc
Diffstat (limited to 'sys/kern/vfs_bio.c')
-rw-r--r-- | sys/kern/vfs_bio.c | 57 |
1 file changed, 50 insertions, 7 deletions
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 1f86b68..a3639b4 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -189,6 +189,7 @@ static int runningbufreq; */ static int needsbuffer; +#ifdef USE_BUFHASH /* * Mask for index into the buffer hash table, which needs to be power of 2 in * size. Set in kern_vfs_bio_buffer_alloc. @@ -208,6 +209,8 @@ static LIST_HEAD(bufhashhdr, buf) *bufhashtbl; */ static struct bufhashhdr invalhash; +#endif + /* * Definitions for the buffer free lists. */ @@ -233,6 +236,7 @@ const char *buf_wmesg = BUF_WMESG; #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ +#ifdef USE_BUFHASH /* * Buffer hash table code. Note that the logical block scans linearly, which * gives us some L1 cache locality. @@ -245,6 +249,8 @@ bufhash(struct vnode *vnp, daddr_t bn) return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]); } +#endif + /* * numdirtywakeup: * @@ -463,6 +469,7 @@ kern_vfs_bio_buffer_alloc(caddr_t v, int physmem_est) buf = (void *)v; v = (caddr_t)(buf + nbuf); +#ifdef USE_BUFHASH /* * Calculate the hash table size and reserve space */ @@ -471,7 +478,7 @@ kern_vfs_bio_buffer_alloc(caddr_t v, int physmem_est) bufhashtbl = (void *)v; v = (caddr_t)(bufhashtbl + bufhashmask); --bufhashmask; - +#endif return(v); } @@ -484,11 +491,15 @@ bufinit(void) GIANT_REQUIRED; +#ifdef USE_BUFHASH LIST_INIT(&invalhash); +#endif mtx_init(&buftimelock, "buftime lock", NULL, MTX_DEF); +#ifdef USE_BUFHASH for (i = 0; i <= bufhashmask; i++) LIST_INIT(&bufhashtbl[i]); +#endif /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) @@ -507,7 +518,9 @@ bufinit(void) LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); +#ifdef USE_BUFHASH LIST_INSERT_HEAD(&invalhash, bp, b_hash); +#endif } /* @@ -787,10 +800,15 @@ bwrite(struct buf * bp) /* get a new block */ newbp = 
geteblk(bp->b_bufsize); - /* set it to be identical to the old block */ + /* + * set it to be identical to the old block. We have to + * set b_lblkno and BKGRDMARKER before calling bgetvp() + * to avoid confusing the splay tree and gbincore(). + */ memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); - bgetvp(bp->b_vp, newbp); newbp->b_lblkno = bp->b_lblkno; + newbp->b_xflags |= BX_BKGRDMARKER; + bgetvp(bp->b_vp, newbp); newbp->b_blkno = bp->b_blkno; newbp->b_offset = bp->b_offset; newbp->b_iodone = vfs_backgroundwritedone; @@ -1302,8 +1320,10 @@ brelse(struct buf * bp) bp->b_qindex = QUEUE_EMPTY; } TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); +#ifdef USE_BUFHASH LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); +#endif bp->b_dev = NODEV; /* buffers with junk contents */ } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || @@ -1314,8 +1334,10 @@ brelse(struct buf * bp) panic("losing buffer 2"); bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); +#ifdef USE_BUFHASH LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); +#endif bp->b_dev = NODEV; /* buffers that are locked */ @@ -1336,11 +1358,17 @@ brelse(struct buf * bp) } /* - * If B_INVAL, clear B_DELWRI. We've already placed the buffer - * on the correct queue. + * If B_INVAL and B_DELWRI is set, clear B_DELWRI. We have already + * placed the buffer on the correct queue. We must also disassociate + * the device and vnode for a B_INVAL buffer so gbincore() doesn't + * find it. */ - if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) - bundirty(bp); + if (bp->b_flags & B_INVAL) { + if (bp->b_flags & B_DELWRI) + bundirty(bp); + if (bp->b_vp) + brelvp(bp); + } /* * Fixup numfreebuffers count. The bp is on an appropriate queue @@ -1493,7 +1521,10 @@ vfs_vmio_release(bp) brelvp(bp); } +#ifdef USE_BUFHASH /* + * XXX MOVED TO VFS_SUBR.C + * * Check to see if a block is currently memory resident. 
*/ struct buf * @@ -1514,6 +1545,7 @@ gbincore(struct vnode * vp, daddr_t blkno) } return (bp); } +#endif /* * vfs_bio_awrite: @@ -1782,8 +1814,10 @@ restart: buf_deallocate(bp); if (bp->b_xflags & BX_BKGRDINPROG) panic("losing buffer 3"); +#ifdef USE_BUFHASH LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); +#endif if (bp->b_bufsize) allocbuf(bp, 0); @@ -2231,7 +2265,9 @@ getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) { struct buf *bp; int s; +#ifdef USE_BUFHASH struct bufhashhdr *bh; +#endif if (size > MAXBSIZE) panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); @@ -2392,6 +2428,11 @@ loop: * race because we are safely running at splbio() from the * point of the duplicate buffer creation through to here, * and we've locked the buffer. + * + * Note: this must occur before we associate the buffer + * with the vp especially considering limitations in + * the splay tree implementation when dealing with duplicate + * lblkno's. */ if (gbincore(vp, blkno)) { bp->b_flags |= B_INVAL; @@ -2407,9 +2448,11 @@ loop: bp->b_offset = offset; bgetvp(vp, bp); +#ifdef USE_BUFHASH LIST_REMOVE(bp, b_hash); bh = bufhash(vp, blkno); LIST_INSERT_HEAD(bh, bp, b_hash); +#endif /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the |