author		jeff <jeff@FreeBSD.org>		2013-06-05 23:53:00 +0000
committer	jeff <jeff@FreeBSD.org>		2013-06-05 23:53:00 +0000
commit		38a637b8a0c41db72b88be9e4e6962f2981be080 (patch)
tree		312fac452bdd477ef171cb7c662de6b82069bf35 /sys/kern
parent		dea53e7c2f1e0765e02b0f5867388d9325b71e43 (diff)
- Consolidate duplicate code into support functions.
- Split the bqlock into bqclean and bqdirty locks.
- Only acquire the wakeup synchronization locks when we cross a
  threshold requiring them.
- Restructure the way flushbufqueues() targets work so they are more
  smp friendly and sane.

Reviewed by:	kib
Discussed with:	mckusick, attilio
Sponsored by:	EMC / Isilon Storage Division

M	vfs_bio.c
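The heart of the change is a "wakeup only on threshold crossing" pattern: counters such as numdirtybuffers and runningbufspace are maintained with atomic fetch-add, and the sleep/wakeup mutex is taken only by the single update that crosses the relevant watermark, as bdirtyadd(), bdirtysub() and runningbufwakeup() do in the diff below. What follows is a minimal userland sketch of that pattern using C11 atomics and pthreads, not the kernel code itself; every name in it (dirty_count, dirty_waiting, LO_DIRTY, HI_DIRTY) is an illustrative stand-in for the corresponding kernel variable (numdirtybuffers, bdirtywait, lodirtybuffers, hidirtybuffers).

/*
 * Userland sketch of the threshold-crossing wakeup pattern; the fast
 * path never touches the mutex, it is taken only on the one update
 * that crosses the watermark.  All names are illustrative stand-ins.
 */
#include <pthread.h>
#include <stdatomic.h>

#define LO_DIRTY  64				/* stand-in for lodirtybuffers */
#define HI_DIRTY  256				/* stand-in for hidirtybuffers */
#define MID_DIRTY ((LO_DIRTY + HI_DIRTY) / 2)

static atomic_int dirty_count;			/* stand-in for numdirtybuffers */
static pthread_mutex_t dirty_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  dirty_cv   = PTHREAD_COND_INITIALIZER;
static int dirty_waiting;			/* stand-in for bdirtywait */

/* Analogue of bdirtysub(): wake writers only when crossing the midpoint. */
static void
dirty_sub(void)
{
	/*
	 * fetch-add returns the old value, so old == MID_DIRTY means this
	 * decrement is the one that drops the count below the midpoint;
	 * only then is the lock taken and a wakeup issued.
	 */
	if (atomic_fetch_add(&dirty_count, -1) == MID_DIRTY) {
		pthread_mutex_lock(&dirty_lock);
		if (dirty_waiting) {
			dirty_waiting = 0;
			pthread_cond_broadcast(&dirty_cv);
		}
		pthread_mutex_unlock(&dirty_lock);
	}
}

/* Analogue of bwillwrite(): block while there are too many dirty buffers. */
static void
dirty_block(void)
{
	if (atomic_load(&dirty_count) >= HI_DIRTY) {
		pthread_mutex_lock(&dirty_lock);
		while (atomic_load(&dirty_count) >= HI_DIRTY) {
			dirty_waiting = 1;
			pthread_cond_wait(&dirty_cv, &dirty_lock);
		}
		pthread_mutex_unlock(&dirty_lock);
	}
}

The same idea is why bdwrite() no longer calls bd_wakeup() on every delayed write: bdirtyadd() issues the wakeup exactly once, when the dirty count crosses the midpoint, and the buf daemon keeps running until the condition clears.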
Diffstat (limited to 'sys/kern')
-rw-r--r--	sys/kern/vfs_bio.c	569
1 file changed, 305 insertions(+), 264 deletions(-)
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 03ba78c..767ed30 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -113,10 +113,11 @@ static void vfs_setdirty_locked_object(struct buf *bp);
static void vfs_vmio_release(struct buf *bp);
static int vfs_bio_clcheck(struct vnode *vp, int size,
daddr_t lblkno, daddr_t blkno);
-static int buf_do_flush(struct vnode *vp);
+static int buf_flush(struct vnode *vp, int);
static int flushbufqueues(struct vnode *, int, int);
static void buf_daemon(void);
static void bremfreel(struct buf *bp);
+static __inline void bd_wakeup(void);
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
@@ -217,8 +218,8 @@ SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
static int flushbufqtarget = 100;
SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
"Amount of work to do in flushbufqueues when helping bufdaemon");
-static long notbufdflashes;
-SYSCTL_LONG(_vfs, OID_AUTO, notbufdflashes, CTLFLAG_RD, &notbufdflashes, 0,
+static long notbufdflushes;
+SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
"Number of dirty buffer flushes done by the bufdaemon helpers");
static long barrierwrites;
SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
@@ -228,6 +229,37 @@ SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
"Permit the use of the unmapped i/o");
/*
+ * Lock for the non-dirty bufqueues
+ */
+static struct mtx_padalign bqclean;
+
+/*
+ * Lock for the dirty queue.
+ */
+static struct mtx_padalign bqdirty;
+
+/*
+ * This lock synchronizes access to bd_request.
+ */
+static struct mtx_padalign bdlock;
+
+/*
+ * This lock protects the runningbufreq and synchronizes runningbufwakeup and
+ * waitrunningbufspace().
+ */
+static struct mtx_padalign rbreqlock;
+
+/*
+ * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
+ */
+static struct mtx_padalign nblock;
+
+/*
+ * Lock that protects bdirtywait.
+ */
+static struct mtx_padalign bdirtylock;
+
+/*
* Wakeup point for bufdaemon, as well as indicator of whether it is already
* active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it
* is idling.
@@ -243,11 +275,6 @@ static int bd_request;
static int bd_speedupreq;
/*
- * This lock synchronizes access to bd_request.
- */
-static struct mtx bdlock;
-
-/*
* bogus page -- for I/O to/from partially complete buffers
* this is a temporary solution to the problem, but it is not
* really that bad. it would be better to split the buffer
@@ -263,25 +290,19 @@ vm_page_t bogus_page;
*/
static int runningbufreq;
-/*
- * This lock protects the runningbufreq and synchronizes runningbufwakeup and
- * waitrunningbufspace().
- */
-static struct mtx rbreqlock;
-
/*
* Synchronization (sleep/wakeup) variable for buffer requests.
* Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
* by and/or.
- * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(),
+ * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(),
* getnewbuf(), and getblk().
*/
static int needsbuffer;
/*
- * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
+ * Synchronization for bwillwrite() waiters.
*/
-static struct mtx nblock;
+static int bdirtywait;
/*
* Definitions for the buffer free lists.
@@ -301,9 +322,6 @@ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
static int bq_len[BUFFER_QUEUES];
#endif
-/* Lock for the bufqueues */
-static struct mtx bqlock;
-
/*
* Single global constant for BUF_WMESG, to avoid getting multiple references.
* buf_wmesg is referred from macros.
@@ -311,7 +329,6 @@ static struct mtx bqlock;
const char *buf_wmesg = BUF_WMESG;
#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */
-#define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */
#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */
#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
@@ -337,25 +354,69 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS)
#ifdef DIRECTIO
extern void ffs_rawread_setup(void);
#endif /* DIRECTIO */
+
/*
- * numdirtywakeup:
+ * bqlock:
*
- * If someone is blocked due to there being too many dirty buffers,
- * and numdirtybuffers is now reasonable, wake them up.
+ * Return the appropriate queue lock based on the index.
*/
-
-static __inline void
-numdirtywakeup(int level)
+static inline struct mtx *
+bqlock(int qindex)
{
- if (numdirtybuffers <= level) {
- mtx_lock(&nblock);
- if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
- needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
- wakeup(&needsbuffer);
- }
- mtx_unlock(&nblock);
+ if (qindex == QUEUE_DIRTY)
+ return (struct mtx *)(&bqdirty);
+ return (struct mtx *)(&bqclean);
+}
+
+/*
+ * bdirtywakeup:
+ *
+ * Wakeup any bwillwrite() waiters.
+ */
+static void
+bdirtywakeup(void)
+{
+ mtx_lock(&bdirtylock);
+ if (bdirtywait) {
+ bdirtywait = 0;
+ wakeup(&bdirtywait);
}
+ mtx_unlock(&bdirtylock);
+}
+
+/*
+ * bdirtysub:
+ *
+ * Decrement the numdirtybuffers count by one and wakeup any
+ * threads blocked in bwillwrite().
+ */
+static void
+bdirtysub(void)
+{
+
+ if (atomic_fetchadd_int(&numdirtybuffers, -1) ==
+ (lodirtybuffers + hidirtybuffers) / 2)
+ bdirtywakeup();
+}
+
+/*
+ * bdirtyadd:
+ *
+ * Increment the numdirtybuffers count by one and wakeup the buf
+ * daemon if needed.
+ */
+static void
+bdirtyadd(void)
+{
+
+ /*
+ * Only do the wakeup once as we cross the boundary. The
+ * buf daemon will keep running until the condition clears.
+ */
+ if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
+ (lodirtybuffers + hidirtybuffers) / 2)
+ bd_wakeup();
}
/*
@@ -385,36 +446,59 @@ bufspacewakeup(void)
}
/*
- * runningbufwakeup() - in-progress I/O accounting.
+ * runningwakeup:
*
+ * Wake up processes that are waiting on asynchronous writes to fall
+ * below lorunningspace.
+ */
+static void
+runningwakeup(void)
+{
+
+ mtx_lock(&rbreqlock);
+ if (runningbufreq) {
+ runningbufreq = 0;
+ wakeup(&runningbufreq);
+ }
+ mtx_unlock(&rbreqlock);
+}
+
+/*
+ * runningbufwakeup:
+ *
+ * Decrement the outstanding write count according.
*/
void
runningbufwakeup(struct buf *bp)
{
+ long space, bspace;
- if (bp->b_runningbufspace) {
- atomic_subtract_long(&runningbufspace, bp->b_runningbufspace);
- bp->b_runningbufspace = 0;
- mtx_lock(&rbreqlock);
- if (runningbufreq && runningbufspace <= lorunningspace) {
- runningbufreq = 0;
- wakeup(&runningbufreq);
- }
- mtx_unlock(&rbreqlock);
- }
+ if (bp->b_runningbufspace == 0)
+ return;
+ space = atomic_fetchadd_long(&runningbufspace, -bp->b_runningbufspace);
+ bspace = bp->b_runningbufspace;
+ bp->b_runningbufspace = 0;
+ /*
+ * Only acquire the lock and wakeup on the transition from exceeding
+ * the threshold to falling below it.
+ */
+ if (space < lorunningspace)
+ return;
+ if (space - bspace > lorunningspace)
+ return;
+ runningwakeup();
}
/*
- * bufcountwakeup:
+ * bufcountadd:
*
* Called when a buffer has been added to one of the free queues to
* account for the buffer and to wakeup anyone waiting for free buffers.
* This typically occurs when large amounts of metadata are being handled
* by the buffer cache ( else buffer space runs out first, usually ).
*/
-
static __inline void
-bufcountwakeup(struct buf *bp)
+bufcountadd(struct buf *bp)
{
int old;
@@ -435,6 +519,30 @@ bufcountwakeup(struct buf *bp)
}
/*
+ * bufcountsub:
+ *
+ * Decrement the numfreebuffers count as needed.
+ */
+static void
+bufcountsub(struct buf *bp)
+{
+ int old;
+
+ /*
+ * Fixup numfreebuffers count. If the buffer is invalid or not
+ * delayed-write, the buffer was free and we must decrement
+ * numfreebuffers.
+ */
+ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
+ KASSERT((bp->b_flags & B_INFREECNT) != 0,
+ ("buf %p not counted in numfreebuffers", bp));
+ bp->b_flags &= ~B_INFREECNT;
+ old = atomic_fetchadd_int(&numfreebuffers, -1);
+ KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
+ }
+}
+
+/*
* waitrunningbufspace()
*
* runningbufspace is a measure of the amount of I/O currently
@@ -442,9 +550,6 @@ bufcountwakeup(struct buf *bp)
* prevent creating huge backups of pending writes to a device.
* Only asynchronous writes are governed by this function.
*
- * Reads will adjust runningbufspace, but will not block based on it.
- * The read load has a side effect of reducing the allowed write load.
- *
* This does NOT turn an async write into a sync write. It waits
* for earlier writes to complete and generally returns before the
* caller's write has reached the device.
@@ -485,13 +590,12 @@ vfs_buf_test_cache(struct buf *bp,
}
/* Wake up the buffer daemon if necessary */
-static __inline
-void
-bd_wakeup(int dirtybuflevel)
+static __inline void
+bd_wakeup(void)
{
mtx_lock(&bdlock);
- if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
+ if (bd_request == 0) {
bd_request = 1;
wakeup(&bd_request);
}
@@ -501,7 +605,6 @@ bd_wakeup(int dirtybuflevel)
/*
* bd_speedup - speedup the buffer cache flushing code
*/
-
void
bd_speedup(void)
{
@@ -656,10 +759,12 @@ bufinit(void)
struct buf *bp;
int i;
- mtx_init(&bqlock, "buf queue lock", NULL, MTX_DEF);
+ mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
+ mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
+ mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
/* next, make a null set of free lists */
for (i = 0; i < BUFFER_QUEUES; i++)
@@ -831,15 +936,60 @@ bfreekva(struct buf *bp)
}
/*
+ * binsfree:
+ *
+ * Insert the buffer into the appropriate free list.
+ */
+static void
+binsfree(struct buf *bp, int qindex)
+{
+ struct mtx *olock, *nlock;
+
+ BUF_ASSERT_XLOCKED(bp);
+
+ olock = bqlock(bp->b_qindex);
+ nlock = bqlock(qindex);
+ mtx_lock(olock);
+ /* Handle delayed bremfree() processing. */
+ if (bp->b_flags & B_REMFREE)
+ bremfreel(bp);
+
+ if (bp->b_qindex != QUEUE_NONE)
+ panic("binsfree: free buffer onto another queue???");
+
+ bp->b_qindex = qindex;
+ if (olock != nlock) {
+ mtx_unlock(olock);
+ mtx_lock(nlock);
+ }
+ if (bp->b_flags & B_AGE)
+ TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
+ else
+ TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
+#ifdef INVARIANTS
+ bq_len[bp->b_qindex]++;
+#endif
+ mtx_unlock(nlock);
+
+ /*
+ * Something we can maybe free or reuse.
+ */
+ if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
+ bufspacewakeup();
+
+ if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
+ bufcountadd(bp);
+}
+
+/*
* bremfree:
*
- * Mark the buffer for removal from the appropriate free list in brelse.
+ * Mark the buffer for removal from the appropriate free list.
*
*/
void
bremfree(struct buf *bp)
{
- int old;
CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
KASSERT((bp->b_flags & B_REMFREE) == 0,
@@ -849,14 +999,7 @@ bremfree(struct buf *bp)
BUF_ASSERT_XLOCKED(bp);
bp->b_flags |= B_REMFREE;
- /* Fixup numfreebuffers count. */
- if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
- KASSERT((bp->b_flags & B_INFREECNT) != 0,
- ("buf %p not counted in numfreebuffers", bp));
- bp->b_flags &= ~B_INFREECNT;
- old = atomic_fetchadd_int(&numfreebuffers, -1);
- KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
- }
+ bufcountsub(bp);
}
/*
@@ -868,28 +1011,30 @@ bremfree(struct buf *bp)
void
bremfreef(struct buf *bp)
{
- mtx_lock(&bqlock);
+ struct mtx *qlock;
+
+ qlock = bqlock(bp->b_qindex);
+ mtx_lock(qlock);
bremfreel(bp);
- mtx_unlock(&bqlock);
+ mtx_unlock(qlock);
}
/*
* bremfreel:
*
* Removes a buffer from the free list, must be called with the
- * bqlock held.
+ * correct qlock held.
*/
static void
bremfreel(struct buf *bp)
{
- int old;
CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
bp, bp->b_vp, bp->b_flags);
KASSERT(bp->b_qindex != QUEUE_NONE,
("bremfreel: buffer %p not on a queue.", bp));
BUF_ASSERT_XLOCKED(bp);
- mtx_assert(&bqlock, MA_OWNED);
+ mtx_assert(bqlock(bp->b_qindex), MA_OWNED);
TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
#ifdef INVARIANTS
@@ -906,18 +1051,7 @@ bremfreel(struct buf *bp)
bp->b_flags &= ~B_REMFREE;
return;
}
- /*
- * Fixup numfreebuffers count. If the buffer is invalid or not
- * delayed-write, the buffer was free and we must decrement
- * numfreebuffers.
- */
- if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
- KASSERT((bp->b_flags & B_INFREECNT) != 0,
- ("buf %p not counted in numfreebuffers", bp));
- bp->b_flags &= ~B_INFREECNT;
- old = atomic_fetchadd_int(&numfreebuffers, -1);
- KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
- }
+ bufcountsub(bp);
}
/*
@@ -1018,6 +1152,7 @@ bufwrite(struct buf *bp)
{
int oldflags;
struct vnode *vp;
+ long space;
int vp_md;
CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
@@ -1065,7 +1200,7 @@ bufwrite(struct buf *bp)
* Normal bwrites pipeline writes
*/
bp->b_runningbufspace = bp->b_bufsize;
- atomic_add_long(&runningbufspace, bp->b_runningbufspace);
+ space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
if (!TD_IS_IDLETHREAD(curthread))
curthread->td_ru.ru_oublock++;
@@ -1078,7 +1213,7 @@ bufwrite(struct buf *bp)
int rtval = bufwait(bp);
brelse(bp);
return (rtval);
- } else {
+ } else if (space > hirunningspace) {
/*
* don't allow the async write to saturate the I/O
* system. We will not deadlock here because
@@ -1213,13 +1348,6 @@ bdwrite(struct buf *bp)
bqrelse(bp);
/*
- * Wakeup the buffer flushing daemon if we have a lot of dirty
- * buffers (midpoint between our recovery point and our stall
- * point).
- */
- bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
-
- /*
* note: we cannot initiate I/O from a bdwrite even if we wanted to,
* due to the softdep code.
*/
@@ -1259,8 +1387,7 @@ bdirty(struct buf *bp)
if ((bp->b_flags & B_DELWRI) == 0) {
bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
reassignbuf(bp);
- atomic_add_int(&numdirtybuffers, 1);
- bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
+ bdirtyadd();
}
}
@@ -1288,8 +1415,7 @@ bundirty(struct buf *bp)
if (bp->b_flags & B_DELWRI) {
bp->b_flags &= ~B_DELWRI;
reassignbuf(bp);
- atomic_subtract_int(&numdirtybuffers, 1);
- numdirtywakeup(lodirtybuffers);
+ bdirtysub();
}
/*
* Since it is now being written, we can clear its deferred write flag.
@@ -1357,20 +1483,18 @@ bbarrierwrite(struct buf *bp)
* of any vnodes we attempt to avoid the situation where a locked vnode
* prevents the various system daemons from flushing related buffers.
*/
-
void
bwillwrite(void)
{
if (numdirtybuffers >= hidirtybuffers) {
- mtx_lock(&nblock);
+ mtx_lock(&bdirtylock);
while (numdirtybuffers >= hidirtybuffers) {
- bd_wakeup(1);
- needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
- msleep(&needsbuffer, &nblock,
- (PRIBIO + 4), "flswai", 0);
+ bdirtywait = 1;
+ msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
+ "flswai", 0);
}
- mtx_unlock(&nblock);
+ mtx_unlock(&bdirtylock);
}
}
@@ -1403,6 +1527,8 @@ buf_vm_page_count_severe(void)
void
brelse(struct buf *bp)
{
+ int qindex;
+
CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
bp, bp->b_vp, bp->b_flags);
KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
@@ -1441,10 +1567,8 @@ brelse(struct buf *bp)
bp->b_flags |= B_INVAL;
if (!LIST_EMPTY(&bp->b_dep))
buf_deallocate(bp);
- if (bp->b_flags & B_DELWRI) {
- atomic_subtract_int(&numdirtybuffers, 1);
- numdirtywakeup(lodirtybuffers);
- }
+ if (bp->b_flags & B_DELWRI)
+ bdirtysub();
bp->b_flags &= ~(B_DELWRI | B_CACHE);
if ((bp->b_flags & B_VMIO) == 0) {
if (bp->b_bufsize)
@@ -1591,15 +1715,6 @@ brelse(struct buf *bp)
brelvp(bp);
}
- /* enqueue */
- mtx_lock(&bqlock);
- /* Handle delayed bremfree() processing. */
- if (bp->b_flags & B_REMFREE)
- bremfreel(bp);
-
- if (bp->b_qindex != QUEUE_NONE)
- panic("brelse: free buffer onto another queue???");
-
/*
* If the buffer has junk contents signal it and eventually
* clean up B_DELWRI and diassociate the vnode so that gbincore()
@@ -1620,54 +1735,26 @@ brelse(struct buf *bp)
bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
if (bp->b_vflags & BV_BKGRDINPROG)
panic("losing buffer 1");
- if (bp->b_kvasize) {
- bp->b_qindex = QUEUE_EMPTYKVA;
- } else {
- bp->b_qindex = QUEUE_EMPTY;
- }
- TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
+ if (bp->b_kvasize)
+ qindex = QUEUE_EMPTYKVA;
+ else
+ qindex = QUEUE_EMPTY;
+ bp->b_flags |= B_AGE;
/* buffers with junk contents */
} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
(bp->b_ioflags & BIO_ERROR)) {
bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
if (bp->b_vflags & BV_BKGRDINPROG)
panic("losing buffer 2");
- bp->b_qindex = QUEUE_CLEAN;
- TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
+ qindex = QUEUE_CLEAN;
+ bp->b_flags |= B_AGE;
/* remaining buffers */
- } else {
- if (bp->b_flags & B_DELWRI)
- bp->b_qindex = QUEUE_DIRTY;
- else
- bp->b_qindex = QUEUE_CLEAN;
- if (bp->b_flags & B_AGE) {
- TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp,
- b_freelist);
- } else {
- TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp,
- b_freelist);
- }
- }
-#ifdef INVARIANTS
- bq_len[bp->b_qindex]++;
-#endif
- mtx_unlock(&bqlock);
-
- /*
- * Fixup numfreebuffers count. The bp is on an appropriate queue
- * unless locked. We then bump numfreebuffers if it is not B_DELWRI.
- * We've already handled the B_INVAL case ( B_DELWRI will be clear
- * if B_INVAL is set ).
- */
-
- if (!(bp->b_flags & B_DELWRI))
- bufcountwakeup(bp);
+ } else if (bp->b_flags & B_DELWRI)
+ qindex = QUEUE_DIRTY;
+ else
+ qindex = QUEUE_CLEAN;
- /*
- * Something we can maybe free or reuse
- */
- if (bp->b_bufsize || bp->b_kvasize)
- bufspacewakeup();
+ binsfree(bp, qindex);
bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
@@ -1690,7 +1777,7 @@ brelse(struct buf *bp)
void
bqrelse(struct buf *bp)
{
- struct bufobj *bo;
+ int qindex;
CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
@@ -1701,71 +1788,40 @@ bqrelse(struct buf *bp)
BUF_UNLOCK(bp);
return;
}
+ bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
- bo = bp->b_bufobj;
if (bp->b_flags & B_MANAGED) {
- if (bp->b_flags & B_REMFREE) {
- mtx_lock(&bqlock);
- bremfreel(bp);
- mtx_unlock(&bqlock);
- }
- bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
- BUF_UNLOCK(bp);
- return;
+ if (bp->b_flags & B_REMFREE)
+ bremfreef(bp);
+ goto out;
}
- mtx_lock(&bqlock);
- /* Handle delayed bremfree() processing. */
- if (bp->b_flags & B_REMFREE)
- bremfreel(bp);
-
- if (bp->b_qindex != QUEUE_NONE)
- panic("bqrelse: free buffer onto another queue???");
/* buffers with stale but valid contents */
if (bp->b_flags & B_DELWRI) {
- bp->b_qindex = QUEUE_DIRTY;
- TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
-#ifdef INVARIANTS
- bq_len[bp->b_qindex]++;
-#endif
+ qindex = QUEUE_DIRTY;
} else {
+ if ((bp->b_flags & B_DELWRI) == 0 &&
+ (bp->b_xflags & BX_VNDIRTY))
+ panic("bqrelse: not dirty");
/*
* BKGRDINPROG can only be set with the buf and bufobj
* locks both held. We tolerate a race to clear it here.
*/
- if (!buf_vm_page_count_severe() ||
- (bp->b_vflags & BV_BKGRDINPROG)) {
- bp->b_qindex = QUEUE_CLEAN;
- TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp,
- b_freelist);
-#ifdef INVARIANTS
- bq_len[QUEUE_CLEAN]++;
-#endif
- } else {
+ if (buf_vm_page_count_severe() &&
+ (bp->b_vflags & BV_BKGRDINPROG) == 0) {
/*
* We are too low on memory, we have to try to free
* the buffer (most importantly: the wired pages
* making up its backing store) *now*.
*/
- mtx_unlock(&bqlock);
brelse(bp);
return;
}
+ qindex = QUEUE_CLEAN;
}
- mtx_unlock(&bqlock);
-
- if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
- bufcountwakeup(bp);
+ binsfree(bp, qindex);
- /*
- * Something we can maybe free or reuse.
- */
- if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
- bufspacewakeup();
-
- bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
- if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
- panic("bqrelse: not dirty");
+out:
/* unlock */
BUF_UNLOCK(bp);
}
@@ -2000,7 +2056,7 @@ getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
char *waitmsg;
int fl, flags, norunbuf;
- mtx_assert(&bqlock, MA_OWNED);
+ mtx_assert(&bqclean, MA_OWNED);
if (defrag) {
flags = VFS_BIO_NEED_BUFSPACE;
@@ -2015,7 +2071,7 @@ getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
mtx_lock(&nblock);
needsbuffer |= flags;
mtx_unlock(&nblock);
- mtx_unlock(&bqlock);
+ mtx_unlock(&bqclean);
bd_speedup(); /* heeeelp */
if ((gbflags & GB_NOWAIT_BD) != 0)
@@ -2038,7 +2094,7 @@ getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
(td->td_pflags & TDP_NORUNNINGBUF);
/* play bufdaemon */
td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
- fl = buf_do_flush(vp);
+ fl = buf_flush(vp, flushbufqtarget);
td->td_pflags &= norunbuf;
mtx_lock(&nblock);
if (fl != 0)
@@ -2060,7 +2116,7 @@ getnewbuf_reuse_bp(struct buf *bp, int qindex)
CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
"queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
bp->b_kvasize, bp->b_bufsize, qindex);
- mtx_assert(&bqlock, MA_NOTOWNED);
+ mtx_assert(&bqclean, MA_NOTOWNED);
/*
* Note: we no longer distinguish between VMIO and non-VMIO
@@ -2156,7 +2212,7 @@ restart:
* where we cannot backup.
*/
nbp = NULL;
- mtx_lock(&bqlock);
+ mtx_lock(&bqclean);
if (!defrag && unmapped) {
nqindex = QUEUE_EMPTY;
nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
@@ -2267,14 +2323,14 @@ restart:
("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
bremfreel(bp);
- mtx_unlock(&bqlock);
+ mtx_unlock(&bqclean);
/*
* NOTE: nbp is now entirely invalid. We can only restart
* the scan from this point on.
*/
getnewbuf_reuse_bp(bp, qindex);
- mtx_assert(&bqlock, MA_NOTOWNED);
+ mtx_assert(&bqclean, MA_NOTOWNED);
/*
* If we are defragging then free the buffer.
@@ -2335,10 +2391,6 @@ restart:
* We have insufficient buffer space
* buffer_map is too fragmented ( space reservation fails )
* If we have to flush dirty buffers ( but we try to avoid this )
- *
- * To avoid VFS layer recursion we do not flush dirty buffers ourselves.
- * Instead we ask the buf daemon to do it for us. We attempt to
- * avoid piecemeal wakeups of the pageout daemon.
*/
static struct buf *
getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
@@ -2379,11 +2431,11 @@ restart:
* Generally we are sleeping due to insufficient buffer space.
*/
if (bp == NULL) {
- mtx_assert(&bqlock, MA_OWNED);
+ mtx_assert(&bqclean, MA_OWNED);
getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
- mtx_assert(&bqlock, MA_NOTOWNED);
+ mtx_assert(&bqclean, MA_NOTOWNED);
} else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
- mtx_assert(&bqlock, MA_NOTOWNED);
+ mtx_assert(&bqclean, MA_NOTOWNED);
bfreekva(bp);
bp->b_flags |= B_UNMAPPED;
@@ -2393,7 +2445,7 @@ restart:
atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
atomic_add_int(&bufreusecnt, 1);
} else {
- mtx_assert(&bqlock, MA_NOTOWNED);
+ mtx_assert(&bqclean, MA_NOTOWNED);
/*
* We finally have a valid bp. We aren't quite out of the
@@ -2464,18 +2516,20 @@ static struct kproc_desc buf_kp = {
SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
static int
-buf_do_flush(struct vnode *vp)
+buf_flush(struct vnode *vp, int target)
{
int flushed;
- flushed = flushbufqueues(vp, QUEUE_DIRTY, 0);
+ flushed = flushbufqueues(vp, target, 0);
if (flushed == 0) {
/*
* Could not find any buffers without rollback
* dependencies, so just write the first one
* in the hopes of eventually making progress.
*/
- flushbufqueues(vp, QUEUE_DIRTY, 1);
+ if (vp != NULL && target > 2)
+ target /= 2;
+ flushbufqueues(vp, target, 1);
}
return (flushed);
}
@@ -2483,7 +2537,7 @@ buf_do_flush(struct vnode *vp)
static void
buf_daemon()
{
- int lodirtysave;
+ int lodirty;
/*
* This process needs to be suspended prior to shutdown sync.
@@ -2501,23 +2555,21 @@ buf_daemon()
mtx_unlock(&bdlock);
kproc_suspend_check(bufdaemonproc);
- lodirtysave = lodirtybuffers;
+ lodirty = lodirtybuffers;
if (bd_speedupreq) {
- lodirtybuffers = numdirtybuffers / 2;
+ lodirty = numdirtybuffers / 2;
bd_speedupreq = 0;
}
/*
* Do the flush. Limit the amount of in-transit I/O we
* allow to build up, otherwise we would completely saturate
- * the I/O system. Wakeup any waiting processes before we
- * normally would so they can run in parallel with our drain.
+ * the I/O system.
*/
- while (numdirtybuffers > lodirtybuffers) {
- if (buf_do_flush(NULL) == 0)
+ while (numdirtybuffers > lodirty) {
+ if (buf_flush(NULL, numdirtybuffers - lodirty) == 0)
break;
kern_yield(PRI_USER);
}
- lodirtybuffers = lodirtysave;
/*
* Only clear bd_request if we have reached our low water
@@ -2526,8 +2578,8 @@ buf_daemon()
* built up, within reason.
*
* If we were unable to hit our low water mark and couldn't
- * find any flushable buffers, we sleep half a second.
- * Otherwise we loop immediately.
+ * find any flushable buffers, we sleep for a short period
+ * to avoid endless loops on unlockable buffers.
*/
mtx_lock(&bdlock);
if (numdirtybuffers <= lodirtybuffers) {
@@ -2537,6 +2589,14 @@ buf_daemon()
* The sleep is just so the suspend code works.
*/
bd_request = 0;
+ /*
+ * Do an extra wakeup in case dirty threshold
+ * changed via sysctl and the explicit transition
+ * out of shortfall was missed.
+ */
+ bdirtywakeup();
+ if (runningbufspace <= lorunningspace)
+ runningwakeup();
msleep(&bd_request, &bdlock, PVM, "psleep", hz);
} else {
/*
@@ -2561,7 +2621,7 @@ SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
0, "Number of buffers flushed with dependecies that require rollbacks");
static int
-flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
+flushbufqueues(struct vnode *lvp, int target, int flushdeps)
{
struct buf *sentinel;
struct vnode *vp;
@@ -2569,19 +2629,14 @@ flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
struct buf *bp;
int hasdeps;
int flushed;
- int target;
+ int queue;
- if (lvp == NULL) {
- target = numdirtybuffers - lodirtybuffers;
- if (flushdeps && target > 2)
- target /= 2;
- } else
- target = flushbufqtarget;
flushed = 0;
+ queue = QUEUE_DIRTY;
bp = NULL;
sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
sentinel->b_qindex = QUEUE_SENTINEL;
- mtx_lock(&bqlock);
+ mtx_lock(&bqdirty);
TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
while (flushed != target) {
bp = TAILQ_NEXT(sentinel, b_freelist);
@@ -2620,11 +2675,10 @@ flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
}
if (bp->b_flags & B_INVAL) {
bremfreel(bp);
- mtx_unlock(&bqlock);
+ mtx_unlock(&bqdirty);
brelse(bp);
flushed++;
- numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
- mtx_lock(&bqlock);
+ mtx_lock(&bqdirty);
continue;
}
@@ -2652,7 +2706,7 @@ flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
continue;
}
if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_CANRECURSE) == 0) {
- mtx_unlock(&bqlock);
+ mtx_unlock(&bqdirty);
CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
bp, bp->b_vp, bp->b_flags);
if (curproc == bufdaemonproc)
@@ -2660,7 +2714,7 @@ flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
else {
bremfree(bp);
bwrite(bp);
- notbufdflashes++;
+ notbufdflushes++;
}
vn_finished_write(mp);
VOP_UNLOCK(vp, 0);
@@ -2671,17 +2725,17 @@ flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
* Sleeping on runningbufspace while holding
* vnode lock leads to deadlock.
*/
- if (curproc == bufdaemonproc)
+ if (curproc == bufdaemonproc &&
+ runningbufspace > hirunningspace)
waitrunningbufspace();
- numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
- mtx_lock(&bqlock);
+ mtx_lock(&bqdirty);
continue;
}
vn_finished_write(mp);
BUF_UNLOCK(bp);
}
TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
- mtx_unlock(&bqlock);
+ mtx_unlock(&bqdirty);
free(sentinel, M_TEMP);
return (flushed);
}
@@ -2994,22 +3048,6 @@ getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
bo = &vp->v_bufobj;
loop:
- /*
- * Block if we are low on buffers. Certain processes are allowed
- * to completely exhaust the buffer cache.
- *
- * If this check ever becomes a bottleneck it may be better to
- * move it into the else, when gbincore() fails. At the moment
- * it isn't a problem.
- */
- if (numfreebuffers == 0) {
- if (TD_IS_IDLETHREAD(curthread))
- return NULL;
- mtx_lock(&nblock);
- needsbuffer |= VFS_BIO_NEED_ANY;
- mtx_unlock(&nblock);
- }
-
BO_RLOCK(bo);
bp = gbincore(bo, blkno);
if (bp != NULL) {
@@ -3154,6 +3192,9 @@ loop:
*/
if (flags & GB_NOCREAT)
return NULL;
+ if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread))
+ return NULL;
+
bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
offset = blkno * bsize;
vmio = vp->v_object != NULL;
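For readers following the bqclean/bqdirty split, the new binsfree() above also introduces a small lock-handoff idiom for moving a buffer between queues protected by different mutexes. Below is a simplified userland sketch of that idiom using pthreads and <sys/queue.h>; the types and names (struct qbuf, qlock, queue_move) are illustrative and are not part of the kernel change.

/*
 * Userland sketch of the binsfree()-style lock handoff.  The caller is
 * assumed to hold an exclusive reference to the buffer (the analogue of
 * the buf lock asserted by BUF_ASSERT_XLOCKED), so the buffer cannot be
 * found on either queue by another thread during the handoff.
 */
#include <pthread.h>
#include <sys/queue.h>

enum { Q_CLEAN, Q_DIRTY, NQUEUES };	/* stand-ins for QUEUE_CLEAN/QUEUE_DIRTY */

struct qbuf {
	TAILQ_ENTRY(qbuf) link;
	int qindex;			/* which queue the buffer is on */
};

static TAILQ_HEAD(, qbuf) queues[NQUEUES] = {
	TAILQ_HEAD_INITIALIZER(queues[Q_CLEAN]),
	TAILQ_HEAD_INITIALIZER(queues[Q_DIRTY]),
};
static pthread_mutex_t qlocks[NQUEUES] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* Analogue of bqlock(): pick the mutex that covers a given queue. */
static pthread_mutex_t *
qlock(int qindex)
{
	return (&qlocks[qindex]);
}

/* Analogue of binsfree(): move a buffer already on queues[bp->qindex]. */
static void
queue_move(struct qbuf *bp, int qindex)
{
	pthread_mutex_t *olock, *nlock;

	olock = qlock(bp->qindex);
	nlock = qlock(qindex);

	pthread_mutex_lock(olock);
	TAILQ_REMOVE(&queues[bp->qindex], bp, link);
	bp->qindex = qindex;
	/* Hand off to the new queue's lock only if it actually differs. */
	if (olock != nlock) {
		pthread_mutex_unlock(olock);
		pthread_mutex_lock(nlock);
	}
	TAILQ_INSERT_TAIL(&queues[qindex], bp, link);
	pthread_mutex_unlock(nlock);
}

Holding only one queue lock at a time keeps the lock order trivial and means clean-queue traffic never waits behind dirty-queue manipulation, which is the contention the bqclean/bqdirty split is meant to remove.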