author		jeff <jeff@FreeBSD.org>		2013-06-05 23:53:00 +0000
committer	jeff <jeff@FreeBSD.org>		2013-06-05 23:53:00 +0000
commit		38a637b8a0c41db72b88be9e4e6962f2981be080 (patch)
tree		312fac452bdd477ef171cb7c662de6b82069bf35 /sys/kern
parent		dea53e7c2f1e0765e02b0f5867388d9325b71e43 (diff)
- Consolidate duplicate code into support functions.
- Split the bqlock into bqclean and bqdirty locks.
- Only acquire the wakeup synchronization locks when we cross a
  threshold requiring them.
- Restructure the way flushbufqueues() targets work so they are more
  smp friendly and sane.

Reviewed by:	kib
Discussed with:	mckusick, attilio
Sponsored by:	EMC / Isilon Storage Division

M	vfs_bio.c
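The heart of the change is a "wakeup only on threshold crossing" pattern: counters such as numdirtybuffers and runningbufspace are maintained with atomic fetch-add, and the sleep/wakeup mutex is taken only by the single update that crosses the relevant watermark, as bdirtyadd(), bdirtysub() and runningbufwakeup() do in the diff below. What follows is a minimal userland sketch of that pattern using C11 atomics and pthreads, not the kernel code itself; every name in it (dirty_count, dirty_waiting, LO_DIRTY, HI_DIRTY) is an illustrative stand-in for the corresponding kernel variable (numdirtybuffers, bdirtywait, lodirtybuffers, hidirtybuffers).

/*
 * Userland sketch of the threshold-crossing wakeup pattern; the fast
 * path never touches the mutex, it is taken only on the one update
 * that crosses the watermark.  All names are illustrative stand-ins.
 */
#include <pthread.h>
#include <stdatomic.h>

#define LO_DIRTY  64				/* stand-in for lodirtybuffers */
#define HI_DIRTY  256				/* stand-in for hidirtybuffers */
#define MID_DIRTY ((LO_DIRTY + HI_DIRTY) / 2)

static atomic_int dirty_count;			/* stand-in for numdirtybuffers */
static pthread_mutex_t dirty_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  dirty_cv   = PTHREAD_COND_INITIALIZER;
static int dirty_waiting;			/* stand-in for bdirtywait */

/* Analogue of bdirtysub(): wake writers only when crossing the midpoint. */
static void
dirty_sub(void)
{
	/*
	 * fetch-add returns the old value, so old == MID_DIRTY means this
	 * decrement is the one that drops the count below the midpoint;
	 * only then is the lock taken and a wakeup issued.
	 */
	if (atomic_fetch_add(&dirty_count, -1) == MID_DIRTY) {
		pthread_mutex_lock(&dirty_lock);
		if (dirty_waiting) {
			dirty_waiting = 0;
			pthread_cond_broadcast(&dirty_cv);
		}
		pthread_mutex_unlock(&dirty_lock);
	}
}

/* Analogue of bwillwrite(): block while there are too many dirty buffers. */
static void
dirty_block(void)
{
	if (atomic_load(&dirty_count) >= HI_DIRTY) {
		pthread_mutex_lock(&dirty_lock);
		while (atomic_load(&dirty_count) >= HI_DIRTY) {
			dirty_waiting = 1;
			pthread_cond_wait(&dirty_cv, &dirty_lock);
		}
		pthread_mutex_unlock(&dirty_lock);
	}
}

The same idea is why bdwrite() no longer calls bd_wakeup() on every delayed write: bdirtyadd() issues the wakeup exactly once, when the dirty count crosses the midpoint, and the buf daemon keeps running until the condition clears.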
Diffstat (limited to 'sys/kern')
-rw-r--r--	sys/kern/vfs_bio.c	569
1 file changed, 305 insertions(+), 264 deletions(-)
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 03ba78c..767ed30 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -113,10 +113,11 @@ static void vfs_setdirty_locked_object(struct buf *bp);
static void vfs_vmio_release(struct buf *bp);
static int vfs_bio_clcheck(struct vnode *vp, int size,
daddr_t lblkno, daddr_t blkno);
-static int buf_do_flush(struct vnode *vp);
+static int buf_flush(struct vnode *vp, int);
static int flushbufqueues(struct vnode *, int, int);
static void buf_daemon(void);
static void bremfreel(struct buf *bp);
+static __inline void bd_wakeup(void);
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
@@ -217,8 +218,8 @@ SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
static int flushbufqtarget = 100;
SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
"Amount of work to do in flushbufqueues when helping bufdaemon");
-static long notbufdflashes;
-SYSCTL_LONG(_vfs, OID_AUTO, notbufdflashes, CTLFLAG_RD, &notbufdflashes, 0,
+static long notbufdflushes;
+SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
"Number of dirty buffer flushes done by the bufdaemon helpers");
static long barrierwrites;
SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
@@ -228,6 +229,37 @@ SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
"Permit the use of the unmapped i/o");
/*
+ * Lock for the non-dirty bufqueues
+ */
+static struct mtx_padalign bqclean;
+
+/*
+ * Lock for the dirty queue.
+ */
+static struct mtx_padalign bqdirty;
+
+/*
+ * This lock synchronizes access to bd_request.
+ */
+static struct mtx_padalign bdlock;
+
+/*
+ * This lock protects the runningbufreq and synchronizes runningbufwakeup and
+ * waitrunningbufspace().
+ */
+static struct mtx_padalign rbreqlock;
+
+/*
+ * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
+ */
+static struct mtx_padalign nblock;
+
+/*
+ * Lock that protects bdirtywait.
+ */
+static struct mtx_padalign bdirtylock;
+
+/*
* Wakeup point for bufdaemon, as well as indicator of whether it is already
* active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it
* is idling.
@@ -243,11 +275,6 @@ static int bd_request;
static int bd_speedupreq;
/*
- * This lock synchronizes access to bd_request.
- */
-static struct mtx bdlock;
-
-/*
* bogus page -- for I/O to/from partially complete buffers
* this is a temporary solution to the problem, but it is not
* really that bad. it would be better to split the buffer
@@ -263,25 +290,19 @@ vm_page_t bogus_page;
*/
static int runningbufreq;
-/*
- * This lock protects the runningbufreq and synchronizes runningbufwakeup and
- * waitrunningbufspace().
- */
-static struct mtx rbreqlock;
-
/*
* Synchronization (sleep/wakeup) variable for buffer requests.
* Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
* by and/or.
- * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(),
+ * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(),
* getnewbuf(), and getblk().
*/
static int needsbuffer;
/*
- * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
+ * Synchronization for bwillwrite() waiters.
*/
-static struct mtx nblock;
+static int bdirtywait;
/*
* Definitions for the buffer free lists.
@@ -301,9 +322,6 @@ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
static int bq_len[BUFFER_QUEUES];
#endif
-/* Lock for the bufqueues */
-static struct mtx bqlock;
-
/*
* Single global constant for BUF_WMESG, to avoid getting multiple references.
* buf_wmesg is referred from macros.
@@ -311,7 +329,6 @@ static struct mtx bqlock;
const char *buf_wmesg = BUF_WMESG;
#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */
-#define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */
#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */
#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
@@ -337,25 +354,69 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS)
#ifdef DIRECTIO
extern void ffs_rawread_setup(void);
#endif /* DIRECTIO */
+
/*
- * numdirtywakeup:
+ * bqlock:
*
- * If someone is blocked due to there being too many dirty buffers,
- * and numdirtybuffers is now reasonable, wake them up.
+ * Return the appropriate queue lock based on the index.
*/
-
-static __inline void
-numdirtywakeup(int level)
+static inline struct mtx *
+bqlock(int qindex)
{
- if (numdirtybuffers <= level) {
- mtx_lock(&nblock);
- if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
- needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
- wakeup(&needsbuffer);
- }
- mtx_unlock(&nblock);
+ if (qindex == QUEUE_DIRTY)
+ return (struct mtx *)(&bqdirty);
+ return (struct mtx *)(&bqclean);
+}
+
+/*
+ * bdirtywakeup:
+ *
+ * Wakeup any bwillwrite() waiters.
+ */
+static void
+bdirtywakeup(void)
+{
+ mtx_lock(&bdirtylock);
+ if (bdirtywait) {
+ bdirtywait = 0;
+ wakeup(&bdirtywait);
}
+ mtx_unlock(&bdirtylock);
+}
+
+/*
+ * bdirtysub:
+ *
+ * Decrement the numdirtybuffers count by one and wakeup any
+ * threads blocked in bwillwrite().
+ */
+static void
+bdirtysub(void)
+{
+
+ if (atomic_fetchadd_int(&numdirtybuffers, -1) ==
+ (lodirtybuffers + hidirtybuffers) / 2)
+ bdirtywakeup();
+}
+
+/*
+ * bdirtyadd:
+ *
+ * Increment the numdirtybuffers count by one and wakeup the buf
+ * daemon if needed.
+ */
+static void
+bdirtyadd(void)
+{
+
+ /*
+ * Only do the wakeup once as we cross the boundary. The
+ * buf daemon will keep running until the condition clears.
+ */
+ if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
+ (lodirtybuffers + hidirtybuffers) / 2)
+ bd_wakeup();
}
/*
@@ -385,36 +446,59 @@ bufspacewakeup(void)
}
/*
- * runningbufwakeup() - in-progress I/O accounting.
+ * runningwakeup:
*
+ * Wake up processes that are waiting on asynchronous writes to fall
+ * below lorunningspace.
+ */
+static void
+runningwakeup(void)
+{
+
+ mtx_lock(&rbreqlock);
+ if (runningbufreq) {
+ runningbufreq = 0;
+ wakeup(&runningbufreq);
+ }
+ mtx_unlock(&rbreqlock);
+}
+
+/*
+ * runningbufwakeup:
+ *
+ * Decrement the outstanding write count according.
*/
void
runningbufwakeup(struct buf *bp)
{
+ long space, bspace;
- if (bp->b_runningbufspace) {
- atomic_subtract_long(&runningbufspace, bp->b_runningbufspace);
- bp->b_runningbufspace = 0;
- mtx_lock(&rbreqlock);
- if (runningbufreq && runningbufspace <= lorunningspace) {
- runningbufreq = 0;
- wakeup(&runningbufreq);
- }
- mtx_unlock(&rbreqlock);
- }
+ if (bp->b_runningbufspace == 0)
+ return;
+ space = atomic_fetchadd_long(&runningbufspace, -bp->b_runningbufspace);
+ bspace = bp->b_runningbufspace;
+ bp->b_runningbufspace = 0;
+ /*
+ * Only acquire the lock and wakeup on the transition from exceeding
+ * the threshold to falling below it.
+ */
+ if (space < lorunningspace)
+ return;
+ if (space - bspace > lorunningspace)
+ return;
+ runningwakeup();
}
/*
- * bufcountwakeup:
+ * bufcountadd:
*
* Called when a buffer has been added to one of the free queues to
* account for the buffer and to wakeup anyone waiting for free buffers.
* This typically occurs when large amounts of metadata are being handled
* by the buffer cache ( else buffer space runs out first, usually ).
*/
-
static __inline void
-bufcountwakeup(struct buf *bp)
+bufcountadd(struct buf *bp)
{
int old;
@@ -435,6 +519,30 @@ bufcountwakeup(struct buf *bp)
}
/*
+ * bufcountsub:
+ *
+ * Decrement the numfreebuffers count as needed.
+ */
+static void
+bufcountsub(struct buf *bp)
+{
+ int old;
+
+ /*
+ * Fixup numfreebuffers count. If the buffer is invalid or not
+ * delayed-write, the buffer was free and we must decrement
+ * numfreebuffers.
+ */
+ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
+ KASSERT((bp->b_flags & B_INFREECNT) != 0,
+ ("buf %p not counted in numfreebuffers", bp));
+ bp->b_flags &= ~B_INFREECNT;
+ old = atomic_fetchadd_int(&numfreebuffers, -1);
+ KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
+ }
+}
+
+/*
* waitrunningbufspace()
*
* runningbufspace is a measure of the amount of I/O currently
@@ -442,9 +550,6 @@ bufcountwakeup(struct buf *bp)
* prevent creating huge backups of pending writes to a device.
* Only asynchronous writes are governed by this function.
*
- * Reads will adjust runningbufspace, but will not block based on it.
- * The read load has a side effect of reducing the allowed write load.
- *
* This does NOT turn an async write into a sync write. It waits
* for earlier writes to complete and generally returns before the
* caller's write has reached the device.
@@ -485,13 +590,12 @@ vfs_buf_test_cache(struct buf *bp,
}
/* Wake up the buffer daemon if necessary */
-static __inline
-void
-bd_wakeup(int dirtybuflevel)
+static __inline void
+bd_wakeup(void)
{
mtx_lock(&bdlock);
- if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
+ if (bd_request == 0) {
bd_request = 1;
wakeup(&bd_request);
}
@@ -501,7 +605,6 @@ bd_wakeup(int dirtybuflevel)
/*
* bd_speedup - speedup the buffer cache flushing code
*/
-
void
bd_speedup(void)
{
@@ -656,10 +759,12 @@ bufinit(void)
struct buf *bp;
int i;
- mtx_init(&bqlock, "buf queue lock", NULL, MTX_DEF);
+ mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
+ mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
+ mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
/* next, make a null set of free lists */
for (i = 0; i < BUFFER_QUEUES; i++)
@@ -831,15 +936,60 @@ bfreekva(struct buf *bp)
}
/*
+ * binsfree:
+ *
+ * Insert the buffer into the appropriate free list.
+ */
+static void
+binsfree(struct buf *bp, int qindex)
+{
+ struct mtx *olock, *nlock;
+
+ BUF_ASSERT_XLOCKED(bp);
+
+ olock = bqlock(bp->b_qindex);
+ nlock = bqlock(qindex);
+ mtx_lock(olock);
+ /* Handle delayed bremfree() processing. */
+ if (bp->b_flags & B_REMFREE)
+ bremfreel(bp);
+
+ if (bp->b_qindex != QUEUE_NONE)
+ panic("binsfree: free buffer onto another queue???");
+
+ bp->b_qindex = qindex;
+ if (olock != nlock) {
+ mtx_unlock(olock);
+ mtx_lock(nlock);
+ }
+ if (bp->b_flags & B_AGE)
+ TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
+ else
+ TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
+#ifdef INVARIANTS
+ bq_len[bp->b_qindex]++;
+#endif
+ mtx_unlock(nlock);
+
+ /*
+ * Something we can maybe free or reuse.
+ */
+ if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
+ bufspacewakeup();
+
+ if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
+ bufcountadd(bp);
+}
+
+/*
* bremfree:
*
- * Mark the buffer for removal from the appropriate free list in brelse.
+ * Mark the buffer for removal from the appropriate free list.
*
*/
void
bremfree(struct buf *bp)
{
- int old;
CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
KASSERT((bp->b_flags & B_REMFREE) == 0,
@@ -849,14 +999,7 @@ bremfree(struct buf *bp)
BUF_ASSERT_XLOCKED(bp);
bp->b_flags |= B_REMFREE;
- /* Fixup numfreebuffers count. */
- if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
- KASSERT((bp->b_flags & B_INFREECNT) != 0,
- ("buf %p not counted in numfreebuffers", bp));
- bp->b_flags &= ~B_INFREECNT;
- old = atomic_fetchadd_int(&numfreebuffers, -1);
- KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
- }
+ bufcountsub(bp);
}
/*
@@ -868,28 +1011,30 @@ bremfree(struct buf *bp)
void
bremfreef(struct buf *bp)
{
- mtx_lock(&bqlock);
+ struct mtx *qlock;
+
+ qlock = bqlock(bp->b_qindex);
+ mtx_lock(qlock);
bremfreel(bp);
- mtx_unlock(&bqlock);
+ mtx_unlock(qlock);
}
/*
* bremfreel:
*
* Removes a buffer from the free list, must be called with the
- * bqlock held.
+ * correct qlock held.
*/
static void
bremfreel(struct buf *bp)
{
- int old;
CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
bp, bp->b_vp, bp->b_flags);
KASSERT(bp->b_qindex != QUEUE_NONE,
("bremfreel: buffer %p not on a queue.", bp));
BUF_ASSERT_XLOCKED(bp);
- mtx_assert(&bqlock, MA_OWNED);
+ mtx_assert(bqlock(bp->b_qindex), MA_OWNED);
TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
#ifdef INVARIANTS
@@ -906,18 +1051,7 @@ bremfreel(struct buf *bp)
bp->b_flags &= ~B_REMFREE;
return;
}
- /*
- * Fixup numfreebuffers count. If the buffer is invalid or not
- * delayed-write, the buffer was free and we must decrement
- * numfreebuffers.
- */
- if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
- KASSERT((bp->b_flags & B_INFREECNT) != 0,
- ("buf %p not counted in numfreebuffers", bp));
- bp->b_flags &= ~B_INFREECNT;
- old = atomic_fetchadd_int(&numfreebuffers, -1);
- KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
- }
+ bufcountsub(bp);
}
/*
@@ -1018,6 +1152,7 @@ bufwrite(struct buf *bp)
{
int oldflags;
struct vnode *vp;
+ long space;
int vp_md;
CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
@@ -1065,7 +1200,7 @@ bufwrite(struct buf *bp)
* Normal bwrites pipeline writes
*/
bp->b_runningbufspace = bp->b_bufsize;
- atomic_add_long(&runningbufspace, bp->b_runningbufspace);
+ space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
if (!TD_IS_IDLETHREAD(curthread))
curthread->td_ru.ru_oublock++;
@@ -1078,7 +1213,7 @@ bufwrite(struct buf *bp)
int rtval = bufwait(bp);
brelse(bp);
return (rtval);
- } else {
+ } else if (space > hirunningspace) {
/*
* don't allow the async write to saturate the I/O
* system. We will not deadlock here because
@@ -1213,13 +1348,6 @@ bdwrite(struct buf *bp)
bqrelse(bp);
/*
- * Wakeup the buffer flushing daemon if we have a lot of dirty
- * buffers (midpoint between our recovery point and our stall
- * point).
- */
- bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
-
- /*
* note: we cannot initiate I/O from a bdwrite even if we wanted to,
* due to the softdep code.
*/
@@ -1259,8 +1387,7 @@ bdirty(struct buf *bp)
if ((bp->b_flags & B_DELWRI) == 0) {
bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
reassignbuf(bp);
- atomic_add_int(&numdirtybuffers, 1);
- bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
+ bdirtyadd();
}
}
@@ -1288,8 +1415,7 @@ bundirty(struct buf *bp)
if (bp->b_flags & B_DELWRI) {
bp->b_flags &= ~B_DELWRI;
reassignbuf(bp);
- atomic_subtract_int(&numdirtybuffers, 1);
- numdirtywakeup(lodirtybuffers);
+ bdirtysub();
}
/*
* Since it is now being written, we can clear its deferred write flag.
@@ -1357,20 +1483,18 @@ bbarrierwrite(struct buf *bp)
* of any vnodes we attempt to avoid the situation where a locked vnode
* prevents the various system daemons from flushing related buffers.
*/
-
void
bwillwrite(void)
{
if (numdirtybuffers >= hidirtybuffers) {
- mtx_lock(&nblock);
+ mtx_lock(&bdirtylock);
while (numdirtybuffers >= hidirtybuffers) {
- bd_wakeup(1);
- needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
- msleep(&needsbuffer, &nblock,
- (PRIBIO + 4), "flswai", 0);
+ bdirtywait = 1;
+ msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
+ "flswai", 0);
}
- mtx_unlock(&nblock);
+ mtx_unlock(&bdirtylock);
}
}
@@ -1403,6 +1527,8 @@ buf_vm_page_count_severe(void)
void
brelse(struct buf *bp)
{
+ int qindex;
+
CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
bp, bp->b_vp, bp->b_flags);
KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
@@ -1441,10 +1567,8 @@ brelse(struct buf *bp)
bp->b_flags |= B_INVAL;
if (!LIST_EMPTY(&bp->b_dep))
buf_deallocate(bp);
- if (bp->b_flags & B_DELWRI) {
- atomic_subtract_int(&numdirtybuffers, 1);
- numdirtywakeup(lodirtybuffers);
- }
+ if (bp->b_flags & B_DELWRI)
+ bdirtysub();
bp->b_flags &= ~(B_DELWRI | B_CACHE);
if ((bp->b_flags & B_VMIO) == 0) {
if (bp->b_bufsize)
@@ -1591,15 +1715,6 @@ brelse(struct buf *bp)
brelvp(bp);
}
- /* enqueue */
- mtx_lock(&bqlock);
- /* Handle delayed bremfree() processing. */
- if (bp->b_flags & B_REMFREE)
- bremfreel(bp);
-
- if (bp->b_qindex != QUEUE_NONE)
- panic("brelse: free buffer onto another queue???");
-
/*
* If the buffer has junk contents signal it and eventually
* clean up B_DELWRI and diassociate the vnode so that gbincore()
@@ -1620,54 +1735,26 @@ brelse(struct buf *bp)
bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
if (bp->b_vflags & BV_BKGRDINPROG)
panic("losing buffer 1");
- if (bp->b_kvasize) {
- bp->b_qindex = QUEUE_EMPTYKVA;
- } else {
- bp->b_qindex = QUEUE_EMPTY;
- }
- TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
+ if (bp->b_kvasize)
+ qindex = QUEUE_EMPTYKVA;
+ else
+ qindex = QUEUE_EMPTY;
+ bp->b_flags |= B_AGE;
/* buffers with junk contents */
} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
(bp->b_ioflags & BIO_ERROR)) {
bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
if (bp->b_vflags & BV_BKGRDINPROG)
panic("losing buffer 2");
- bp->b_qindex = QUEUE_CLEAN;
- TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
+ qindex = QUEUE_CLEAN;
+ bp->b_flags |= B_AGE;
/* remaining buffers */
- } else {
- if (bp->b_flags & B_DELWRI)
- bp->b_qindex = QUEUE_DIRTY;
- else
- bp->b_qindex = QUEUE_CLEAN;
- if (bp->b_flags & B_AGE) {
- TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp,
- b_freelist);
- } else {
- TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp,
- b_freelist);
- }
- }
-#ifdef INVARIANTS
- bq_len[bp->b_qindex]++;
-#endif
- mtx_unlock(&bqlock);
-
- /*
- * Fixup numfreebuffers count. The bp is on an appropriate queue
- * unless locked. We then bump numfreebuffers if it is not B_DELWRI.
- * We've already handled the B_INVAL case ( B_DELWRI will be clear
- * if B_INVAL is set ).
- */
-
- if (!(bp->b_flags & B_DELWRI))
- bufcountwakeup(bp);
+ } else if (bp->b_flags & B_DELWRI)
+ qindex = QUEUE_DIRTY;
+ else
+ qindex = QUEUE_CLEAN;
- /*
- * Something we can maybe free or reuse
- */
- if (bp->b_bufsize || bp->b_kvasize)
- bufspacewakeup();
+ binsfree(bp, qindex);
bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
@@ -1690,7 +1777,7 @@ brelse(struct buf *bp)
void
bqrelse(struct buf *bp)
{
- struct bufobj *bo;
+ int qindex;
CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
@@ -1701,71 +1788,40 @@ bqrelse(struct buf *bp)
BUF_UNLOCK(bp);
return;
}
+ bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
- bo = bp->b_bufobj;
if (bp->b_flags & B_MANAGED) {
- if (bp->b_flags & B_REMFREE) {
- mtx_lock(&bqlock);
- bremfreel(bp);
- mtx_unlock(&bqlock);
- }
- bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
- BUF_UNLOCK(bp);
- return;
+ if (bp->b_flags & B_REMFREE)
+ bremfreef(bp);
+ goto out;
}
- mtx_lock(&bqlock);
- /* Handle delayed bremfree() processing. */
- if (bp->b_flags & B_REMFREE)
- bremfreel(bp);
-
- if (bp->b_qindex != QUEUE_NONE)
- panic("bqrelse: free buffer onto another queue???");
/* buffers with stale but valid contents */
if (bp->b_flags & B_DELWRI) {
- bp->b_qindex = QUEUE_DIRTY;
- TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
-#ifdef INVARIANTS
- bq_len[bp->b_qindex]++;
-#endif
+ qindex = QUEUE_DIRTY;
} else {
+ if ((bp->b_flags & B_DELWRI) == 0 &&
+ (bp->b_xflags & BX_VNDIRTY))
+ panic("bqrelse: not dirty");
/*
* BKGRDINPROG can only be set with the buf and bufobj
* locks both held. We tolerate a race to clear it here.
*/
- if (!buf_vm_page_count_severe() ||
- (bp->b_vflags & BV_BKGRDINPROG)) {
- bp->b_qindex = QUEUE_CLEAN;
- TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp,
- b_freelist);
-#ifdef INVARIANTS
- bq_len[QUEUE_CLEAN]++;
-#endif
- } else {
+ if (buf_vm_page_count_severe() &&
+ (bp->b_vflags & BV_BKGRDINPROG) == 0) {
/*
* We are too low on memory, we have to try to free
* the buffer (most importantly: the wired pages
* making up its backing store) *now*.
*/
- mtx_unlock(&bqlock);
brelse(bp);
return;
}
+ qindex = QUEUE_CLEAN;
}
- mtx_unlock(&bqlock);
-
- if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
- bufcountwakeup(bp);
+ binsfree(bp, qindex);
- /*
- * Something we can maybe free or reuse.
- */
- if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
- bufspacewakeup();
-
- bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
- if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
- panic("bqrelse: not dirty");
+out:
/* unlock */
BUF_UNLOCK(bp);
}
@@ -2000,7 +2056,7 @@ getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
char *waitmsg;
int fl, flags, norunbuf;
- mtx_assert(&bqlock, MA_OWNED);
+ mtx_assert(&bqclean, MA_OWNED);
if (defrag) {
flags = VFS_BIO_NEED_BUFSPACE;
@@ -2015,7 +2071,7 @@ getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
mtx_lock(&nblock);
needsbuffer |= flags;
mtx_unlock(&nblock);
- mtx_unlock(&bqlock);
+ mtx_unlock(&bqclean);
bd_speedup(); /* heeeelp */
if ((gbflags & GB_NOWAIT_BD) != 0)
@@ -2038,7 +2094,7 @@ getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
(td->td_pflags & TDP_NORUNNINGBUF);
/* play bufdaemon */
td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
- fl = buf_do_flush(vp);
+ fl = buf_flush(vp, flushbufqtarget);
td->td_pflags &= norunbuf;
mtx_lock(&nblock);
if (fl != 0)
@@ -2060,7 +2116,7 @@ getnewbuf_reuse_bp(struct buf *bp, int qindex)
CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
"queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
bp->b_kvasize, bp->b_bufsize, qindex);
- mtx_assert(&bqlock, MA_NOTOWNED);
+ mtx_assert(&bqclean, MA_NOTOWNED);
/*
* Note: we no longer distinguish between VMIO and non-VMIO
@@ -2156,7 +2212,7 @@ restart:
* where we cannot backup.
*/
nbp = NULL;
- mtx_lock(&bqlock);
+ mtx_lock(&bqclean);
if (!defrag && unmapped) {
nqindex = QUEUE_EMPTY;
nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
@@ -2267,14 +2323,14 @@ restart:
("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
bremfreel(bp);
- mtx_unlock(&bqlock);
+ mtx_unlock(&bqclean);
/*
* NOTE: nbp is now entirely invalid. We can only restart
* the scan from this point on.
*/
getnewbuf_reuse_bp(bp, qindex);
- mtx_assert(&bqlock, MA_NOTOWNED);
+ mtx_assert(&bqclean, MA_NOTOWNED);
/*
* If we are defragging then free the buffer.
@@ -2335,10 +2391,6 @@ restart:
* We have insufficient buffer space
* buffer_map is too fragmented ( space reservation fails )
* If we have to flush dirty buffers ( but we try to avoid this )
- *
- * To avoid VFS layer recursion we do not flush dirty buffers ourselves.
- * Instead we ask the buf daemon to do it for us. We attempt to
- * avoid piecemeal wakeups of the pageout daemon.
*/
static struct buf *
getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
@@ -2379,11 +2431,11 @@ restart:
* Generally we are sleeping due to insufficient buffer space.
*/
if (bp == NULL) {
- mtx_assert(&bqlock, MA_OWNED);
+ mtx_assert(&bqclean, MA_OWNED);
getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
- mtx_assert(&bqlock, MA_NOTOWNED);
+ mtx_assert(&bqclean, MA_NOTOWNED);
} else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
- mtx_assert(&bqlock, MA_NOTOWNED);
+ mtx_assert(&bqclean, MA_NOTOWNED);
bfreekva(bp);
bp->b_flags |= B_UNMAPPED;
@@ -2393,7 +2445,7 @@ restart:
atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
atomic_add_int(&bufreusecnt, 1);
} else {
- mtx_assert(&bqlock, MA_NOTOWNED);
+ mtx_assert(&bqclean, MA_NOTOWNED);
/*
* We finally have a valid bp. We aren't quite out of the
@@ -2464,18 +2516,20 @@ static struct kproc_desc buf_kp = {
SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
static int
-buf_do_flush(struct vnode *vp)
+buf_flush(struct vnode *vp, int target)
{
int flushed;
- flushed = flushbufqueues(vp, QUEUE_DIRTY, 0);
+ flushed = flushbufqueues(vp, target, 0);
if (flushed == 0) {
/*
* Could not find any buffers without rollback
* dependencies, so just write the first one
* in the hopes of eventually making progress.
*/
- flushbufqueues(vp, QUEUE_DIRTY, 1);
+ if (vp != NULL && target > 2)
+ target /= 2;
+ flushbufqueues(vp, target, 1);
}
return (flushed);
}
@@ -2483,7 +2537,7 @@ buf_do_flush(struct vnode *vp)
static void
buf_daemon()
{
- int lodirtysave;
+ int lodirty;
/*
* This process needs to be suspended prior to shutdown sync.
@@ -2501,23 +2555,21 @@ buf_daemon()
mtx_unlock(&bdlock);
kproc_suspend_check(bufdaemonproc);
- lodirtysave = lodirtybuffers;
+ lodirty = lodirtybuffers;
if (bd_speedupreq) {
- lodirtybuffers = numdirtybuffers / 2;
+ lodirty = numdirtybuffers / 2;
bd_speedupreq = 0;
}
/*
* Do the flush. Limit the amount of in-transit I/O we
* allow to build up, otherwise we would completely saturate
- * the I/O system. Wakeup any waiting processes before we
- * normally would so they can run in parallel with our drain.
+ * the I/O system.
*/
- while (numdirtybuffers > lodirtybuffers) {
- if (buf_do_flush(NULL) == 0)
+ while (numdirtybuffers > lodirty) {
+ if (buf_flush(NULL, numdirtybuffers - lodirty) == 0)
break;
kern_yield(PRI_USER);
}
- lodirtybuffers = lodirtysave;
/*
* Only clear bd_request if we have reached our low water
@@ -2526,8 +2578,8 @@ buf_daemon()
* built up, within reason.
*
* If we were unable to hit our low water mark and couldn't
- * find any flushable buffers, we sleep half a second.
- * Otherwise we loop immediately.
+ * find any flushable buffers, we sleep for a short period
+ * to avoid endless loops on unlockable buffers.
*/
mtx_lock(&bdlock);
if (numdirtybuffers <= lodirtybuffers) {
@@ -2537,6 +2589,14 @@ buf_daemon()
* The sleep is just so the suspend code works.
*/
bd_request = 0;
+ /*
+ * Do an extra wakeup in case dirty threshold
+ * changed via sysctl and the explicit transition
+ * out of shortfall was missed.
+ */
+ bdirtywakeup();
+ if (runningbufspace <= lorunningspace)
+ runningwakeup();
msleep(&bd_request, &bdlock, PVM, "psleep", hz);
} else {
/*
@@ -2561,7 +2621,7 @@ SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
0, "Number of buffers flushed with dependecies that require rollbacks");
static int
-flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
+flushbufqueues(struct vnode *lvp, int target, int flushdeps)
{
struct buf *sentinel;
struct vnode *vp;
@@ -2569,19 +2629,14 @@ flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
struct buf *bp;
int hasdeps;
int flushed;
- int target;
+ int queue;
- if (lvp == NULL) {
- target = numdirtybuffers - lodirtybuffers;
- if (flushdeps && target > 2)
- target /= 2;
- } else
- target = flushbufqtarget;
flushed = 0;
+ queue = QUEUE_DIRTY;
bp = NULL;
sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
sentinel->b_qindex = QUEUE_SENTINEL;
- mtx_lock(&bqlock);
+ mtx_lock(&bqdirty);
TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
while (flushed != target) {
bp = TAILQ_NEXT(sentinel, b_freelist);
@@ -2620,11 +2675,10 @@ flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
}
if (bp->b_flags & B_INVAL) {
bremfreel(bp);
- mtx_unlock(&bqlock);
+ mtx_unlock(&bqdirty);
brelse(bp);
flushed++;
- numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
- mtx_lock(&bqlock);
+ mtx_lock(&bqdirty);
continue;
}
@@ -2652,7 +2706,7 @@ flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
continue;
}
if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_CANRECURSE) == 0) {
- mtx_unlock(&bqlock);
+ mtx_unlock(&bqdirty);
CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
bp, bp->b_vp, bp->b_flags);
if (curproc == bufdaemonproc)
@@ -2660,7 +2714,7 @@ flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
else {
bremfree(bp);
bwrite(bp);
- notbufdflashes++;
+ notbufdflushes++;
}
vn_finished_write(mp);
VOP_UNLOCK(vp, 0);
@@ -2671,17 +2725,17 @@ flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
* Sleeping on runningbufspace while holding
* vnode lock leads to deadlock.
*/
- if (curproc == bufdaemonproc)
+ if (curproc == bufdaemonproc &&
+ runningbufspace > hirunningspace)
waitrunningbufspace();
- numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
- mtx_lock(&bqlock);
+ mtx_lock(&bqdirty);
continue;
}
vn_finished_write(mp);
BUF_UNLOCK(bp);
}
TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
- mtx_unlock(&bqlock);
+ mtx_unlock(&bqdirty);
free(sentinel, M_TEMP);
return (flushed);
}
@@ -2994,22 +3048,6 @@ getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
bo = &vp->v_bufobj;
loop:
- /*
- * Block if we are low on buffers. Certain processes are allowed
- * to completely exhaust the buffer cache.
- *
- * If this check ever becomes a bottleneck it may be better to
- * move it into the else, when gbincore() fails. At the moment
- * it isn't a problem.
- */
- if (numfreebuffers == 0) {
- if (TD_IS_IDLETHREAD(curthread))
- return NULL;
- mtx_lock(&nblock);
- needsbuffer |= VFS_BIO_NEED_ANY;
- mtx_unlock(&nblock);
- }
-
BO_RLOCK(bo);
bp = gbincore(bo, blkno);
if (bp != NULL) {
@@ -3154,6 +3192,9 @@ loop:
*/
if (flags & GB_NOCREAT)
return NULL;
+ if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread))
+ return NULL;
+
bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
offset = blkno * bsize;
vmio = vp->v_object != NULL;
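For readers following the bqclean/bqdirty split, the new binsfree() above also introduces a small lock-handoff idiom for moving a buffer between queues protected by different mutexes. Below is a simplified userland sketch of that idiom using pthreads and <sys/queue.h>; the types and names (struct qbuf, qlock, queue_move) are illustrative and are not part of the kernel change.

/*
 * Userland sketch of the binsfree()-style lock handoff.  The caller is
 * assumed to hold an exclusive reference to the buffer (the analogue of
 * the buf lock asserted by BUF_ASSERT_XLOCKED), so the buffer cannot be
 * found on either queue by another thread during the handoff.
 */
#include <pthread.h>
#include <sys/queue.h>

enum { Q_CLEAN, Q_DIRTY, NQUEUES };	/* stand-ins for QUEUE_CLEAN/QUEUE_DIRTY */

struct qbuf {
	TAILQ_ENTRY(qbuf) link;
	int qindex;			/* which queue the buffer is on */
};

static TAILQ_HEAD(, qbuf) queues[NQUEUES] = {
	TAILQ_HEAD_INITIALIZER(queues[Q_CLEAN]),
	TAILQ_HEAD_INITIALIZER(queues[Q_DIRTY]),
};
static pthread_mutex_t qlocks[NQUEUES] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* Analogue of bqlock(): pick the mutex that covers a given queue. */
static pthread_mutex_t *
qlock(int qindex)
{
	return (&qlocks[qindex]);
}

/* Analogue of binsfree(): move a buffer already on queues[bp->qindex]. */
static void
queue_move(struct qbuf *bp, int qindex)
{
	pthread_mutex_t *olock, *nlock;

	olock = qlock(bp->qindex);
	nlock = qlock(qindex);

	pthread_mutex_lock(olock);
	TAILQ_REMOVE(&queues[bp->qindex], bp, link);
	bp->qindex = qindex;
	/* Hand off to the new queue's lock only if it actually differs. */
	if (olock != nlock) {
		pthread_mutex_unlock(olock);
		pthread_mutex_lock(nlock);
	}
	TAILQ_INSERT_TAIL(&queues[qindex], bp, link);
	pthread_mutex_unlock(nlock);
}

Holding only one queue lock at a time keeps the lock order trivial and means clean-queue traffic never waits behind dirty-queue manipulation, which is the contention the bqclean/bqdirty split is meant to remove.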