Diffstat (limited to 'sys/kern')
-rw-r--r-- | sys/kern/vfs_bio.c | 1286
1 file changed, 711 insertions, 575 deletions
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 70dc565..4cc9a4e 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -63,6 +63,7 @@ __FBSDID("$FreeBSD$");
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
+#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmem.h>
@@ -100,6 +101,7 @@ caddr_t unmapped_buf;
/* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
struct proc *bufdaemonproc;
+struct proc *bufspacedaemonproc;
static int inmem(struct vnode *vp, daddr_t blkno);
static void vm_hold_free_pages(struct buf *bp, int newbsize);
@@ -116,11 +118,18 @@ static void vfs_vmio_extend(struct buf *bp, int npages, int size);
static int vfs_bio_clcheck(struct vnode *vp, int size,
daddr_t lblkno, daddr_t blkno);
static int buf_flush(struct vnode *vp, int);
+static int buf_recycle(bool);
+static int buf_scan(bool);
static int flushbufqueues(struct vnode *, int, int);
static void buf_daemon(void);
static void bremfreel(struct buf *bp);
static __inline void bd_wakeup(void);
static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
+static void bufkva_reclaim(vmem_t *, int);
+static void bufkva_free(struct buf *);
+static int buf_import(void *, void **, int, int);
+static void buf_release(void *, void **, int);
+
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
@@ -145,23 +154,23 @@ static long bufkvaspace;
SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0,
"Kernel virtual memory used for buffers");
static long maxbufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
- "Maximum allowed value of bufspace (including buf_daemon)");
+SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0,
+ "Maximum allowed value of bufspace (including metadata)");
static long bufmallocspace;
SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
"Amount of malloced memory for buffers");
static long maxbufmallocspace;
-SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
- "Maximum amount of malloced memory for buffers");
+SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace,
+ 0, "Maximum amount of malloced memory for buffers");
static long lobufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
+SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0,
"Minimum amount of buffers we want to have");
long hibufspace;
-SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
- "Maximum allowed value of bufspace (excluding buf_daemon)");
-static int bufreusecnt;
-SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
- "Number of times we have reused a buffer");
+SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0,
+ "Maximum allowed value of bufspace (excluding metadata)");
+long bufspacethresh;
+SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh,
+ 0, "Bufspace consumed before waking the daemon to free some");
static int buffreekvacnt;
SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
"Number of times we have freed the KVA space from some buffer");
@@ -205,10 +214,10 @@ SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
"Number of free buffers");
static int lofreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
- "XXX Unused");
+ "Target number of free buffers");
static int hifreebuffers;
SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
- "XXX Complicatedly unused");
+ "Threshold for clean buffer recycling");
static int getnewbufcalls;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
"Number of calls to getnewbuf");
@@ -219,6 +228,9 @@ static int mappingrestarts;
SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
"Number of times getblk has had to restart a buffer mapping for "
"unmapped buffer");
+static int numbufallocfails;
+SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0,
+ "Number of times buffer allocations failed");
static int flushbufqtarget = 100;
SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
"Amount of work to do in flushbufqueues when helping bufdaemon");
@@ -233,16 +245,6 @@ SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
"Permit the use of the unmapped i/o");
/*
- * Lock for the non-dirty bufqueues
- */
-static struct mtx_padalign bqclean;
-
-/*
- * Lock for the dirty queue.
- */
-static struct mtx_padalign bqdirty;
-
-/*
* This lock synchronizes access to bd_request.
*/
static struct mtx_padalign bdlock;
@@ -271,6 +273,11 @@ static struct mtx_padalign bdirtylock;
static int bd_request;
/*
+ * Request/wakeup point for the bufspace daemon.
+ */
+static int bufspace_request;
+
+/*
* Request for the buf daemon to write more buffers than is indicated by
* lodirtybuf. This may be necessary to push out excess dependencies or
* defragment the address space where a simple count of the number of dirty
@@ -298,7 +305,7 @@ static int runningbufreq;
* Synchronization (sleep/wakeup) variable for buffer requests.
* Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
* by and/or.
- * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(),
+ * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(),
* getnewbuf(), and getblk().
*/
static volatile int needsbuffer;
@@ -311,14 +318,21 @@ static int bdirtywait;
/*
* Definitions for the buffer free lists.
*/
-#define BUFFER_QUEUES 4 /* number of free buffer queues */
-
#define QUEUE_NONE 0 /* on no queue */
-#define QUEUE_CLEAN 1 /* non-B_DELWRI buffers */
+#define QUEUE_EMPTY 1 /* empty buffer headers */
#define QUEUE_DIRTY 2 /* B_DELWRI buffers */
-#define QUEUE_EMPTY 3 /* empty buffer headers */
+#define QUEUE_CLEAN 3 /* non-B_DELWRI buffers */
#define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */
+/* Maximum number of clean buffer queues. */
+#define CLEAN_QUEUES 16
+
+/* Configured number of clean queues. */
+static int clean_queues;
+
+/* Maximum number of buffer queues. */
+#define BUFFER_QUEUES (QUEUE_CLEAN + CLEAN_QUEUES)
+
/* Queues for free buffers with various properties */
static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
#ifdef INVARIANTS
@@ -326,15 +340,21 @@ static int bq_len[BUFFER_QUEUES];
#endif
/*
+ * Lock for each bufqueue
+ */
+static struct mtx_padalign bqlocks[BUFFER_QUEUES];
+
+/*
+ * per-cpu empty buffer cache.
+ */
+uma_zone_t buf_zone;
+
+/*
* Single global constant for BUF_WMESG, to avoid getting multiple references.
* buf_wmesg is referred from macros.
*/
const char *buf_wmesg = BUF_WMESG;
-#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */
-#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */
-#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
-
static int
sysctl_runningspace(SYSCTL_HANDLER_ARGS)
{
@@ -382,6 +402,21 @@ sysctl_bufspace(SYSCTL_HANDLER_ARGS)
}
#endif
+static int
+bqcleanq(void)
+{
+ static int nextq;
+
+ return ((atomic_fetchadd_int(&nextq, 1) % clean_queues) + QUEUE_CLEAN);
+}
+
+static int
+bqisclean(int qindex)
+{
+
+ return (qindex >= QUEUE_CLEAN && qindex < QUEUE_CLEAN + CLEAN_QUEUES);
+}
+
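The bqcleanq() helper added above picks a clean queue by bumping a shared counter with an atomic fetch-add, so concurrent callers spread across the configured queues without taking a lock. A minimal userland sketch of that round-robin selection using C11 atomics; the queue count and function name here are illustrative, not taken from the kernel source:

#include <stdatomic.h>
#include <stdio.h>

#define QUEUE_CLEAN   3            /* index of the first clean queue */

static int clean_queues = 4;       /* configured queue count (illustrative) */
static atomic_uint nextq;          /* shared round-robin cursor */

/* Return the next clean queue index; unsigned math keeps the modulo positive. */
static int
pick_clean_queue(void)
{
    return ((int)(atomic_fetch_add(&nextq, 1u) % (unsigned)clean_queues) +
        QUEUE_CLEAN);
}

int
main(void)
{
    for (int i = 0; i < 8; i++)
        printf("buf %d -> queue %d\n", i, pick_clean_queue());
    return (0);
}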
/*
* bqlock:
*
@@ -391,9 +426,7 @@ static inline struct mtx *
bqlock(int qindex)
{
- if (qindex == QUEUE_DIRTY)
- return (struct mtx *)(&bqdirty);
- return (struct mtx *)(&bqclean);
+ return (struct mtx *)&bqlocks[qindex];
}
/*
@@ -447,62 +480,255 @@ bdirtyadd(void)
}
/*
- * bufspacewakeup:
+ * bufspace_wakeup:
*
* Called when buffer space is potentially available for recovery.
* getnewbuf() will block on this flag when it is unable to free
* sufficient buffer space. Buffer space becomes recoverable when
* bp's get placed back in the queues.
*/
-static __inline void
-bufspacewakeup(void)
+static void
+bufspace_wakeup(void)
{
- int need_wakeup, on;
/*
- * If someone is waiting for bufspace, wake them up. Even
- * though we may not have freed the kva space yet, the waiting
- * process will be able to now.
+ * If someone is waiting for bufspace, wake them up.
+ *
+ * Since needsbuffer is set prior to doing an additional queue
+ * scan it is safe to check for the flag prior to acquiring the
+ * lock. The thread that is preparing to scan again before
+ * blocking would discover the buf we released.
*/
+ if (needsbuffer) {
+ rw_rlock(&nblock);
+ if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1)
+ wakeup(__DEVOLATILE(void *, &needsbuffer));
+ rw_runlock(&nblock);
+ }
+}
+
+/*
+ * bufspace_daemonwakeup:
+ *
+ * Wakeup the daemon responsible for freeing clean bufs.
+ */
+static void
+bufspace_daemonwakeup(void)
+{
rw_rlock(&nblock);
- for (;;) {
- need_wakeup = 0;
- on = needsbuffer;
- if ((on & VFS_BIO_NEED_BUFSPACE) == 0)
- break;
- need_wakeup = 1;
- if (atomic_cmpset_rel_int(&needsbuffer, on,
- on & ~VFS_BIO_NEED_BUFSPACE))
- break;
+ if (bufspace_request == 0) {
+ bufspace_request = 1;
+ wakeup(&bufspace_request);
}
- if (need_wakeup)
- wakeup(__DEVOLATILE(void *, &needsbuffer));
rw_runlock(&nblock);
}
/*
- * bufspaceadjust:
+ * bufspace_adjust:
*
* Adjust the reported bufspace for a KVA managed buffer, possibly
* waking any waiters.
*/
static void
-bufspaceadjust(struct buf *bp, int bufsize)
+bufspace_adjust(struct buf *bp, int bufsize)
{
+ long space;
int diff;
KASSERT((bp->b_flags & B_MALLOC) == 0,
- ("bufspaceadjust: malloc buf %p", bp));
+ ("bufspace_adjust: malloc buf %p", bp));
diff = bufsize - bp->b_bufsize;
if (diff < 0) {
atomic_subtract_long(&bufspace, -diff);
- bufspacewakeup();
- } else
- atomic_add_long(&bufspace, diff);
+ bufspace_wakeup();
+ } else {
+ space = atomic_fetchadd_long(&bufspace, diff);
+ /* Wake up the daemon on the transition. */
+ if (space < bufspacethresh && space + diff >= bufspacethresh)
+ bufspace_daemonwakeup();
+ }
bp->b_bufsize = bufsize;
}
/*
+ * bufspace_reserve:
+ *
+ * Reserve bufspace before calling allocbuf(). metadata has a
+ * different space limit than data.
+ */
+static int
+bufspace_reserve(int size, bool metadata)
+{
+ long limit;
+ long space;
+
+ if (metadata)
+ limit = maxbufspace;
+ else
+ limit = hibufspace;
+ do {
+ space = bufspace;
+ if (space + size > limit)
+ return (ENOSPC);
+ } while (atomic_cmpset_long(&bufspace, space, space + size) == 0);
+
+ /* Wake up the daemon on the transition. */
+ if (space < bufspacethresh && space + size >= bufspacethresh)
+ bufspace_daemonwakeup();
+
+ return (0);
+}
+
+/*
+ * bufspace_release:
+ *
+ * Release reserved bufspace after bufspace_adjust() has consumed it.
+ */
+static void
+bufspace_release(int size)
+{
+ atomic_subtract_long(&bufspace, size);
+ bufspace_wakeup();
+}
+
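bufspace_reserve() and bufspace_release() above form an optimistic reservation pair: a compare-and-swap loop charges the requested size against the metadata or data limit up front, and the caller hands the charge back once allocbuf() has accounted for the space itself. A simplified, self-contained analog of that pairing with C11 atomics; the limit and sizes are made up for illustration:

#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_long space_used;              /* analog of bufspace */
static const long space_limit = 1L << 20;   /* illustrative limit */

/* Charge 'size' against the limit without blocking, or fail with ENOSPC. */
static int
space_reserve(long size)
{
    long cur;

    do {
        cur = atomic_load(&space_used);
        if (cur + size > space_limit)
            return (ENOSPC);
    } while (!atomic_compare_exchange_weak(&space_used, &cur, cur + size));
    return (0);
}

/* Give the reservation back once the consumer has accounted for it. */
static void
space_release(long size)
{
    atomic_fetch_sub(&space_used, size);
}

int
main(void)
{
    if (space_reserve(4096) == 0) {
        /* ... the kernel code would call allocbuf() here ... */
        space_release(4096);
    }
    printf("space used: %ld\n", atomic_load(&space_used));
    return (0);
}

Note also the transition test shared by bufspace_adjust() and bufspace_reserve(), space < bufspacethresh && space + size >= bufspacethresh: only the request that actually crosses the threshold wakes the bufspace daemon, so steady growth does not pay for a wakeup per allocation.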
+/*
+ * bufspace_wait:
+ *
+ * Wait for bufspace, acting as the buf daemon if a locked vnode is
+ * supplied. needsbuffer must be set in a safe fashion prior to
+ * polling for space. The operation must be re-tried on return.
+ */
+static void
+bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo)
+{
+ struct thread *td;
+ int error, fl, norunbuf;
+
+ if ((gbflags & GB_NOWAIT_BD) != 0)
+ return;
+
+ td = curthread;
+ rw_wlock(&nblock);
+ while (needsbuffer != 0) {
+ if (vp != NULL && vp->v_type != VCHR &&
+ (td->td_pflags & TDP_BUFNEED) == 0) {
+ rw_wunlock(&nblock);
+ /*
+ * getblk() is called with a vnode locked, and
+ * some majority of the dirty buffers may as
+ * well belong to the vnode. Flushing the
+ * buffers there would make progress that
+ * cannot be achieved by the buf_daemon, which
+ * cannot lock the vnode.
+ */
+ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
+ (td->td_pflags & TDP_NORUNNINGBUF);
+
+ /*
+ * Play bufdaemon. The getnewbuf() function
+ * may be called while the thread owns lock
+ * for another dirty buffer for the same
+ * vnode, which makes it impossible to use
+ * VOP_FSYNC() there, due to the buffer lock
+ * recursion.
+ */
+ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
+ fl = buf_flush(vp, flushbufqtarget);
+ td->td_pflags &= norunbuf;
+ rw_wlock(&nblock);
+ if (fl != 0)
+ continue;
+ if (needsbuffer == 0)
+ break;
+ }
+ error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
+ (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
+ if (error != 0)
+ break;
+ }
+ rw_wunlock(&nblock);
+}
+
+
+/*
+ * bufspace_daemon:
+ *
+ * buffer space management daemon. Tries to maintain some marginal
+ * amount of free buffer space so that requesting processes neither
+ * block nor work to reclaim buffers.
+ */
+static void
+bufspace_daemon(void)
+{
+ for (;;) {
+ kproc_suspend_check(bufspacedaemonproc);
+
+ /*
+ * Free buffers from the clean queue until we meet our
+ * targets.
+ *
+ * Theory of operation: The buffer cache is most efficient
+ * when some free buffer headers and space are always
+ * available to getnewbuf(). This daemon attempts to prevent
+ * the excessive blocking and synchronization associated
+ * with shortfall. It goes through three phases according
+ * demand:
+ *
+ * 1) The daemon wakes up voluntarily once per second
+ * during idle periods when the counters are below
+ * the wakeup thresholds (bufspacethresh, lofreebuffers).
+ *
+ * 2) The daemon wakes up as we cross the thresholds
+ * ahead of any potential blocking. This may bounce
+ * slightly according to the rate of consumption and
+ * release.
+ *
+ * 3) The daemon and consumers are starved for working
+ * clean buffers. This is the 'bufspace' sleep below
+ * which will inefficiently trade bufs with bqrelse
+ * until we return to condition 2.
+ */
+ while (bufspace > lobufspace ||
+ numfreebuffers < hifreebuffers) {
+ if (buf_recycle(false) != 0) {
+ atomic_set_int(&needsbuffer, 1);
+ if (buf_recycle(false) != 0) {
+ rw_wlock(&nblock);
+ if (needsbuffer)
+ rw_sleep(__DEVOLATILE(void *,
+ &needsbuffer), &nblock,
+ PRIBIO|PDROP, "bufspace",
+ hz/10);
+ else
+ rw_wunlock(&nblock);
+ }
+ }
+ maybe_yield();
+ }
+
+ /*
+ * Re-check our limits under the exclusive nblock.
+ */
+ rw_wlock(&nblock);
+ if (bufspace < bufspacethresh &&
+ numfreebuffers > lofreebuffers) {
+ bufspace_request = 0;
+ rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP,
+ "-", hz);
+ } else
+ rw_wunlock(&nblock);
+ }
+}
+
+static struct kproc_desc bufspace_kp = {
+ "bufspacedaemon",
+ bufspace_daemon,
+ &bufspacedaemonproc
+};
+SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start,
+ &bufspace_kp);
+
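The theory-of-operation comment above describes three regimes: a once-per-second voluntary wakeup while idle, an early wakeup when a consumer crosses the threshold, and a short bounded sleep when the daemon itself cannot reclaim anything. A heavily simplified userland analog of that wakeup structure on POSIX threads; the reclaim step is stubbed out and every name and number is illustrative, not the kernel's:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static long space_used;              /* analog of bufspace */
static const long thresh = 1000;     /* analog of bufspacethresh */
static const long lo = 800;          /* analog of lobufspace */
static bool daemon_requested;        /* analog of bufspace_request */

/* Stub reclaim step; pretends every fourth buffer is busy and unreclaimable. */
static bool
reclaim_one(void)
{
    static int attempts;

    if (space_used == 0 || ++attempts % 4 == 0)
        return (false);
    space_used -= 10;
    return (true);
}

static void *
space_daemon(void *arg)
{
    struct timespec ts;

    (void)arg;
    pthread_mutex_lock(&lock);
    for (;;) {
        /* Phases 2/3: reclaim until we are back under the low target. */
        while (space_used > lo) {
            if (!reclaim_one()) {
                /* Starved: bounded sleep, analog of the hz/10 "bufspace" sleep. */
                clock_gettime(CLOCK_REALTIME, &ts);
                ts.tv_sec += 1;
                pthread_cond_timedwait(&wake, &lock, &ts);
            }
        }
        /* Phase 1: idle until kicked or the periodic timeout expires. */
        daemon_requested = false;
        clock_gettime(CLOCK_REALTIME, &ts);
        ts.tv_sec += 1;
        while (!daemon_requested &&
            pthread_cond_timedwait(&wake, &lock, &ts) == 0)
            ;
    }
    return (NULL);
}

/* Consumer side: wake the daemon only on the upward threshold crossing. */
static void
space_grow(long size)
{
    pthread_mutex_lock(&lock);
    if (space_used < thresh && space_used + size >= thresh &&
        !daemon_requested) {
        daemon_requested = true;
        pthread_cond_signal(&wake);
    }
    space_used += size;
    pthread_mutex_unlock(&lock);
}

int
main(void)
{
    pthread_t td;

    pthread_create(&td, NULL, space_daemon, NULL);
    for (int i = 0; i < 200; i++)
        space_grow(50);
    sleep(2);
    pthread_mutex_lock(&lock);
    printf("space_used after reclaim: %ld\n", space_used);
    pthread_mutex_unlock(&lock);
    return (0);
}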
+/*
* bufmallocadjust:
*
* Adjust the reported bufspace for a malloc managed buffer, possibly
@@ -516,10 +742,9 @@ bufmallocadjust(struct buf *bp, int bufsize)
KASSERT((bp->b_flags & B_MALLOC) != 0,
("bufmallocadjust: non-malloc buf %p", bp));
diff = bufsize - bp->b_bufsize;
- if (diff < 0) {
+ if (diff < 0)
atomic_subtract_long(&bufmallocspace, -diff);
- bufspacewakeup();
- } else
+ else
atomic_add_long(&bufmallocspace, diff);
bp->b_bufsize = bufsize;
}
@@ -571,67 +796,6 @@ runningbufwakeup(struct buf *bp)
}
/*
- * bufcountadd:
- *
- * Called when a buffer has been added to one of the free queues to
- * account for the buffer and to wakeup anyone waiting for free buffers.
- * This typically occurs when large amounts of metadata are being handled
- * by the buffer cache ( else buffer space runs out first, usually ).
- */
-static __inline void
-bufcountadd(struct buf *bp)
-{
- int mask, need_wakeup, old, on;
-
- KASSERT((bp->b_flags & B_INFREECNT) == 0,
- ("buf %p already counted as free", bp));
- bp->b_flags |= B_INFREECNT;
- old = atomic_fetchadd_int(&numfreebuffers, 1);
- KASSERT(old >= 0 && old < nbuf,
- ("numfreebuffers climbed to %d", old + 1));
- mask = VFS_BIO_NEED_ANY;
- if (numfreebuffers >= hifreebuffers)
- mask |= VFS_BIO_NEED_FREE;
- rw_rlock(&nblock);
- for (;;) {
- need_wakeup = 0;
- on = needsbuffer;
- if (on == 0)
- break;
- need_wakeup = 1;
- if (atomic_cmpset_rel_int(&needsbuffer, on, on & ~mask))
- break;
- }
- if (need_wakeup)
- wakeup(__DEVOLATILE(void *, &needsbuffer));
- rw_runlock(&nblock);
-}
-
-/*
- * bufcountsub:
- *
- * Decrement the numfreebuffers count as needed.
- */
-static void
-bufcountsub(struct buf *bp)
-{
- int old;
-
- /*
- * Fixup numfreebuffers count. If the buffer is invalid or not
- * delayed-write, the buffer was free and we must decrement
- * numfreebuffers.
- */
- if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
- KASSERT((bp->b_flags & B_INFREECNT) != 0,
- ("buf %p not counted in numfreebuffers", bp));
- bp->b_flags &= ~B_INFREECNT;
- old = atomic_fetchadd_int(&numfreebuffers, -1);
- KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
- }
-}
-
-/*
* waitrunningbufspace()
*
* runningbufspace is a measure of the amount of I/O currently
@@ -847,8 +1011,10 @@ bufinit(void)
int i;
CTASSERT(MAXBCACHEBUF >= MAXBSIZE);
- mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
- mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
+ mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF);
+ mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF);
+ for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++)
+ mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF);
mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
rw_init(&nblock, "needsbuffer lock");
mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
@@ -864,7 +1030,7 @@ bufinit(void)
for (i = 0; i < nbuf; i++) {
bp = &buf[i];
bzero(bp, sizeof *bp);
- bp->b_flags = B_INVAL | B_INFREECNT;
+ bp->b_flags = B_INVAL;
bp->b_rcred = NOCRED;
bp->b_wcred = NOCRED;
bp->b_qindex = QUEUE_EMPTY;
@@ -881,18 +1047,19 @@ bufinit(void)
/*
* maxbufspace is the absolute maximum amount of buffer space we are
* allowed to reserve in KVM and in real terms. The absolute maximum
- * is nominally used by buf_daemon. hibufspace is the nominal maximum
- * used by most other processes. The differential is required to
- * ensure that buf_daemon is able to run when other processes might
- * be blocked waiting for buffer space.
+ * is nominally used by metadata. hibufspace is the nominal maximum
+ * used by most other requests. The differential is required to
+ * ensure that metadata deadlocks don't occur.
*
* maxbufspace is based on BKVASIZE. Allocating buffers larger then
* this may result in KVM fragmentation which is not handled optimally
- * by the system.
+ * by the system. XXX This is less true with vmem. We could use
+ * PAGE_SIZE.
*/
maxbufspace = (long)nbuf * BKVASIZE;
hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10);
- lobufspace = hibufspace - MAXBCACHEBUF;
+ lobufspace = (hibufspace / 20) * 19; /* 95% */
+ bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;
/*
* Note: The 16 MiB upper limit for hirunningspace was chosen
@@ -906,44 +1073,61 @@ bufinit(void)
16 * 1024 * 1024), 1024 * 1024);
lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF);
-/*
- * Limit the amount of malloc memory since it is wired permanently into
- * the kernel space. Even though this is accounted for in the buffer
- * allocation, we don't want the malloced region to grow uncontrolled.
- * The malloc scheme improves memory utilization significantly on average
- * (small) directories.
- */
+ /*
+ * Limit the amount of malloc memory since it is wired permanently into
+ * the kernel space. Even though this is accounted for in the buffer
+ * allocation, we don't want the malloced region to grow uncontrolled.
+ * The malloc scheme improves memory utilization significantly on
+ * average (small) directories.
+ */
maxbufmallocspace = hibufspace / 20;
-/*
- * Reduce the chance of a deadlock occuring by limiting the number
- * of delayed-write dirty buffers we allow to stack up.
- */
+ /*
+ * Reduce the chance of a deadlock occurring by limiting the number
+ * of delayed-write dirty buffers we allow to stack up.
+ */
hidirtybuffers = nbuf / 4 + 20;
dirtybufthresh = hidirtybuffers * 9 / 10;
numdirtybuffers = 0;
-/*
- * To support extreme low-memory systems, make sure hidirtybuffers cannot
- * eat up all available buffer space. This occurs when our minimum cannot
- * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming
- * BKVASIZE'd buffers.
- */
+ /*
+ * To support extreme low-memory systems, make sure hidirtybuffers
+ * cannot eat up all available buffer space. This occurs when our
+ * minimum cannot be met. We try to size hidirtybuffers to 3/4 our
+ * buffer space assuming BKVASIZE'd buffers.
+ */
while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
hidirtybuffers >>= 1;
}
lodirtybuffers = hidirtybuffers / 2;
-/*
- * Try to keep the number of free buffers in the specified range,
- * and give special processes (e.g. like buf_daemon) access to an
- * emergency reserve.
- */
- lofreebuffers = nbuf / 18 + 5;
- hifreebuffers = 2 * lofreebuffers;
+ /*
+ * lofreebuffers should be sufficient to avoid stalling waiting on
+ * buf headers under heavy utilization. The bufs in per-cpu caches
+ * are counted as free but will be unavailable to threads executing
+ * on other cpus.
+ *
+ * hifreebuffers is the free target for the bufspace daemon. This
+ * should be set appropriately to limit work per-iteration.
+ */
+ lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus);
+ hifreebuffers = (3 * lofreebuffers) / 2;
numfreebuffers = nbuf;
bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
+
+ /* Setup the kva and free list allocators. */
+ vmem_set_reclaim(buffer_arena, bufkva_reclaim);
+ buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf),
+ NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0);
+
+ /*
+ * Size the clean queue according to the amount of buffer space.
+ * One queue per 256 MB up to the max. More queues give better
+ * concurrency but less accurate LRU.
+ */
+ clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES);
+
}
#ifdef INVARIANTS
@@ -1129,10 +1313,25 @@ binsfree(struct buf *bp, int qindex)
{
struct mtx *olock, *nlock;
- BUF_ASSERT_XLOCKED(bp);
+ if (qindex != QUEUE_EMPTY) {
+ BUF_ASSERT_XLOCKED(bp);
+ }
+ /*
+ * Stick to the same clean queue for the lifetime of the buf to
+ * limit locking below. Otherwise pick one sequentially.
+ */
+ if (qindex == QUEUE_CLEAN) {
+ if (bqisclean(bp->b_qindex))
+ qindex = bp->b_qindex;
+ else
+ qindex = bqcleanq();
+ }
+
+ /*
+ * Handle delayed bremfree() processing.
+ */
nlock = bqlock(qindex);
- /* Handle delayed bremfree() processing. */
if (bp->b_flags & B_REMFREE) {
olock = bqlock(bp->b_qindex);
mtx_lock(olock);
@@ -1156,15 +1355,263 @@ binsfree(struct buf *bp, int qindex)
bq_len[bp->b_qindex]++;
#endif
mtx_unlock(nlock);
+}
+
+/*
+ * buf_free:
+ *
+ * Free a buffer to the buf zone once it no longer has valid contents.
+ */
+static void
+buf_free(struct buf *bp)
+{
+
+ if (bp->b_flags & B_REMFREE)
+ bremfreef(bp);
+ if (bp->b_vflags & BV_BKGRDINPROG)
+ panic("losing buffer 1");
+ if (bp->b_rcred != NOCRED) {
+ crfree(bp->b_rcred);
+ bp->b_rcred = NOCRED;
+ }
+ if (bp->b_wcred != NOCRED) {
+ crfree(bp->b_wcred);
+ bp->b_wcred = NOCRED;
+ }
+ if (!LIST_EMPTY(&bp->b_dep))
+ buf_deallocate(bp);
+ bufkva_free(bp);
+ BUF_UNLOCK(bp);
+ uma_zfree(buf_zone, bp);
+ atomic_add_int(&numfreebuffers, 1);
+ bufspace_wakeup();
+}
+
+/*
+ * buf_import:
+ *
+ * Import bufs into the uma cache from the buf list. The system still
+ * expects a static array of bufs and much of the synchronization
+ * around bufs assumes type stable storage. As a result, UMA is used
+ * only as a per-cpu cache of bufs still maintained on a global list.
+ */
+static int
+buf_import(void *arg, void **store, int cnt, int flags)
+{
+ struct buf *bp;
+ int i;
+
+ mtx_lock(&bqlocks[QUEUE_EMPTY]);
+ for (i = 0; i < cnt; i++) {
+ bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+ if (bp == NULL)
+ break;
+ bremfreel(bp);
+ store[i] = bp;
+ }
+ mtx_unlock(&bqlocks[QUEUE_EMPTY]);
+
+ return (i);
+}
+
+/*
+ * buf_release:
+ *
+ * Release bufs from the uma cache back to the buffer queues.
+ */
+static void
+buf_release(void *arg, void **store, int cnt)
+{
+ int i;
+
+ for (i = 0; i < cnt; i++)
+ binsfree(store[i], QUEUE_EMPTY);
+}
+
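buf_import() and buf_release() above let UMA act purely as a per-CPU front end: when a per-CPU bucket runs dry, UMA calls the import callback to pull headers off the global QUEUE_EMPTY list, and when buckets drain back it calls the release callback, so the static buf array remains the only backing store. A rough userland analog of that import/release contract, using a mutex-protected global free list as the backing store; the structures and callback signatures are simplified stand-ins, not the uma(9) ones:

#include <pthread.h>
#include <stdio.h>

#define NBUF    64
#define NCACHE   8                      /* size of a front-end cache refill */

struct xbuf {
    struct xbuf *next;
    int          id;
};

static struct xbuf bufs[NBUF];          /* static backing array, as in vfs_bio.c */
static struct xbuf *freelist;           /* analog of bufqueues[QUEUE_EMPTY] */
static pthread_mutex_t freelock = PTHREAD_MUTEX_INITIALIZER;

/* Import up to 'cnt' headers from the global list into 'store'. */
static int
xbuf_import(struct xbuf **store, int cnt)
{
    int i;

    pthread_mutex_lock(&freelock);
    for (i = 0; i < cnt && freelist != NULL; i++) {
        store[i] = freelist;
        freelist = freelist->next;
    }
    pthread_mutex_unlock(&freelock);
    return (i);
}

/* Release 'cnt' headers from 'store' back to the global list. */
static void
xbuf_release(struct xbuf **store, int cnt)
{
    pthread_mutex_lock(&freelock);
    for (int i = 0; i < cnt; i++) {
        store[i]->next = freelist;
        freelist = store[i];
    }
    pthread_mutex_unlock(&freelock);
}

int
main(void)
{
    struct xbuf *cache[NCACHE];
    int n;

    /* Thread the static array onto the backing free list. */
    for (int i = 0; i < NBUF; i++) {
        bufs[i].id = i;
        bufs[i].next = freelist;
        freelist = &bufs[i];
    }
    n = xbuf_import(cache, NCACHE);     /* a per-CPU bucket refill */
    printf("imported %d headers\n", n);
    xbuf_release(cache, n);             /* returning the bucket */
    return (0);
}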
+/*
+ * buf_alloc:
+ *
+ * Allocate an empty buffer header.
+ */
+static struct buf *
+buf_alloc(void)
+{
+ struct buf *bp;
+
+ bp = uma_zalloc(buf_zone, M_NOWAIT);
+ if (bp == NULL) {
+ bufspace_daemonwakeup();
+ atomic_add_int(&numbufallocfails, 1);
+ return (NULL);
+ }
+
+ /*
+ * Wake-up the bufspace daemon on transition.
+ */
+ if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers)
+ bufspace_daemonwakeup();
+
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
+ panic("getnewbuf_empty: Locked buf %p on free queue.", bp);
+
+ KASSERT(bp->b_vp == NULL,
+ ("bp: %p still has vnode %p.", bp, bp->b_vp));
+ KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
+ ("invalid buffer %p flags %#x", bp, bp->b_flags));
+ KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
+ ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
+ KASSERT(bp->b_npages == 0,
+ ("bp: %p still has %d vm pages\n", bp, bp->b_npages));
+ KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
+ KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
+
+ bp->b_flags = 0;
+ bp->b_ioflags = 0;
+ bp->b_xflags = 0;
+ bp->b_vflags = 0;
+ bp->b_vp = NULL;
+ bp->b_blkno = bp->b_lblkno = 0;
+ bp->b_offset = NOOFFSET;
+ bp->b_iodone = 0;
+ bp->b_error = 0;
+ bp->b_resid = 0;
+ bp->b_bcount = 0;
+ bp->b_npages = 0;
+ bp->b_dirtyoff = bp->b_dirtyend = 0;
+ bp->b_bufobj = NULL;
+ bp->b_pin_count = 0;
+ bp->b_data = bp->b_kvabase = unmapped_buf;
+ bp->b_fsprivate1 = NULL;
+ bp->b_fsprivate2 = NULL;
+ bp->b_fsprivate3 = NULL;
+ LIST_INIT(&bp->b_dep);
+
+ return (bp);
+}
+
+/*
+ * buf_qrecycle:
+ *
+ * Free a buffer from the given bufqueue. kva controls whether the
+ * freed buf must own some kva resources. This is used for
+ * defragmenting.
+ */
+static int
+buf_qrecycle(int qindex, bool kva)
+{
+ struct buf *bp, *nbp;
+
+ if (kva)
+ atomic_add_int(&bufdefragcnt, 1);
+ nbp = NULL;
+ mtx_lock(&bqlocks[qindex]);
+ nbp = TAILQ_FIRST(&bufqueues[qindex]);
/*
- * Something we can maybe free or reuse.
+ * Run scan, possibly freeing data and/or kva mappings on the fly
+ * depending on the kva argument.
*/
- if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
- bufspacewakeup();
+ while ((bp = nbp) != NULL) {
+ /*
+ * Calculate next bp (we can only use it if we do not
+ * release the bqlock).
+ */
+ nbp = TAILQ_NEXT(bp, b_freelist);
+
+ /*
+ * If we are defragging then we need a buffer with
+ * some kva to reclaim.
+ */
+ if (kva && bp->b_kvasize == 0)
+ continue;
+
+ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
+ continue;
+
+ /*
+ * Skip buffers with background writes in progress.
+ */
+ if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
+
+ KASSERT(bp->b_qindex == qindex,
+ ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
+ /*
+ * NOTE: nbp is now entirely invalid. We can only restart
+ * the scan from this point on.
+ */
+ bremfreel(bp);
+ mtx_unlock(&bqlocks[qindex]);
+
+ /*
+ * Requeue the background write buffer with error and
+ * restart the scan.
+ */
+ if ((bp->b_vflags & BV_BKGRDERR) != 0) {
+ bqrelse(bp);
+ mtx_lock(&bqlocks[qindex]);
+ nbp = TAILQ_FIRST(&bufqueues[qindex]);
+ continue;
+ }
+ bp->b_flags |= B_INVAL;
+ brelse(bp);
+ return (0);
+ }
+ mtx_unlock(&bqlocks[qindex]);
+
+ return (ENOBUFS);
+}
- if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
- bufcountadd(bp);
+/*
+ * buf_recycle:
+ *
+ * Iterate through all clean queues until we find a buf to recycle or
+ * exhaust the search.
+ */
+static int
+buf_recycle(bool kva)
+{
+ int qindex, first_qindex;
+
+ qindex = first_qindex = bqcleanq();
+ do {
+ if (buf_qrecycle(qindex, kva) == 0)
+ return (0);
+ if (++qindex == QUEUE_CLEAN + clean_queues)
+ qindex = QUEUE_CLEAN;
+ } while (qindex != first_qindex);
+
+ return (ENOBUFS);
+}
+
+/*
+ * buf_scan:
+ *
+ * Scan the clean queues looking for a buffer to recycle. needsbuffer
+ * is set on failure so that the caller may optionally bufspace_wait()
+ * in a race-free fashion.
+ */
+static int
+buf_scan(bool defrag)
+{
+ int error;
+
+ /*
+ * To avoid heavy synchronization and wakeup races we set
+ * needsbuffer and re-poll before failing. This ensures that
+ * no frees can be missed between an unsuccessful poll and
+ * going to sleep in a synchronized fashion.
+ */
+ if ((error = buf_recycle(defrag)) != 0) {
+ atomic_set_int(&needsbuffer, 1);
+ bufspace_daemonwakeup();
+ error = buf_recycle(defrag);
+ }
+ if (error == 0)
+ atomic_add_int(&getnewbufrestarts, 1);
+ return (error);
}
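buf_scan() above avoids a lost wakeup by publishing needsbuffer before retrying: any thread that frees a buffer after the flag is set will see it in bufspace_wakeup() and issue a wakeup, so a waiter that still finds nothing can sleep knowing no free slipped past in between. A small pthread sketch of that set-flag, re-poll, then sleep ordering; the resource counter and names are illustrative only:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static pthread_mutex_t nlock = PTHREAD_MUTEX_INITIALIZER;   /* analog of nblock */
static pthread_cond_t  navail = PTHREAD_COND_INITIALIZER;
static atomic_int needsbuffer;          /* published "waiter present" flag */
static atomic_int nfree = 1;            /* illustrative resource count */

/* One lock-free attempt to take a unit of the resource. */
static bool
try_get(void)
{
    int cur;

    do {
        cur = atomic_load(&nfree);
        if (cur == 0)
            return (false);
    } while (!atomic_compare_exchange_weak(&nfree, &cur, cur - 1));
    return (true);
}

/* Consumer: try, publish the flag, re-poll, and only then sleep. */
static void
get_resource(void)
{
    for (;;) {
        if (try_get())
            return;
        atomic_store(&needsbuffer, 1);  /* publish before the re-poll */
        if (try_get())
            return;                     /* a free raced in; no sleep needed */
        pthread_mutex_lock(&nlock);
        while (atomic_load(&needsbuffer) != 0)
            pthread_cond_wait(&navail, &nlock);
        pthread_mutex_unlock(&nlock);
    }
}

/* Producer: frees a unit and wakes waiters only if the flag was published. */
static void
put_resource(void)
{
    atomic_fetch_add(&nfree, 1);
    if (atomic_load(&needsbuffer) != 0) {
        pthread_mutex_lock(&nlock);
        if (atomic_exchange(&needsbuffer, 0) != 0)
            pthread_cond_broadcast(&navail);
        pthread_mutex_unlock(&nlock);
    }
}

int
main(void)
{
    get_resource();     /* takes the initial unit */
    put_resource();     /* returns it, clearing any published flag */
    return (0);
}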
/*
@@ -1185,7 +1632,6 @@ bremfree(struct buf *bp)
BUF_ASSERT_XLOCKED(bp);
bp->b_flags |= B_REMFREE;
- bufcountsub(bp);
}
/*
@@ -1219,7 +1665,9 @@ bremfreel(struct buf *bp)
bp, bp->b_vp, bp->b_flags);
KASSERT(bp->b_qindex != QUEUE_NONE,
("bremfreel: buffer %p not on a queue.", bp));
- BUF_ASSERT_XLOCKED(bp);
+ if (bp->b_qindex != QUEUE_EMPTY) {
+ BUF_ASSERT_XLOCKED(bp);
+ }
mtx_assert(bqlock(bp->b_qindex), MA_OWNED);
TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
@@ -1229,25 +1677,17 @@ bremfreel(struct buf *bp)
bq_len[bp->b_qindex]--;
#endif
bp->b_qindex = QUEUE_NONE;
- /*
- * If this was a delayed bremfree() we only need to remove the buffer
- * from the queue and return the stats are already done.
- */
- if (bp->b_flags & B_REMFREE) {
- bp->b_flags &= ~B_REMFREE;
- return;
- }
- bufcountsub(bp);
+ bp->b_flags &= ~B_REMFREE;
}
/*
- * bufkvafree:
+ * bufkva_free:
*
* Free the kva allocation for a buffer.
*
*/
static void
-bufkvafree(struct buf *bp)
+bufkva_free(struct buf *bp)
{
#ifdef INVARIANTS
@@ -1271,12 +1711,12 @@ bufkvafree(struct buf *bp)
}
/*
- * bufkvaalloc:
+ * bufkva_alloc:
*
* Allocate the buffer KVA and set b_kvasize and b_kvabase.
*/
static int
-bufkvaalloc(struct buf *bp, int maxsize, int gbflags)
+bufkva_alloc(struct buf *bp, int maxsize, int gbflags)
{
vm_offset_t addr;
int error;
@@ -1284,7 +1724,7 @@ bufkvaalloc(struct buf *bp, int maxsize, int gbflags)
KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0,
("Invalid gbflags 0x%x in %s", gbflags, __func__));
- bufkvafree(bp);
+ bufkva_free(bp);
addr = 0;
error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr);
@@ -1293,7 +1733,6 @@ bufkvaalloc(struct buf *bp, int maxsize, int gbflags)
* Buffer map is too fragmented. Request the caller
* to defragment the map.
*/
- atomic_add_int(&bufdefragcnt, 1);
return (error);
}
bp->b_kvabase = (caddr_t)addr;
@@ -1310,6 +1749,24 @@ bufkvaalloc(struct buf *bp, int maxsize, int gbflags)
}
/*
+ * bufkva_reclaim:
+ *
+ * Reclaim buffer kva by freeing buffers holding kva. This is a vmem
+ * reclaim callback, invoked when an allocation from buffer_arena cannot
+ * otherwise be satisfied.
+ */
+static void
+bufkva_reclaim(vmem_t *vmem, int flags)
+{
+ int i;
+
+ for (i = 0; i < 5; i++)
+ if (buf_scan(true) != 0)
+ break;
+ return;
+}
+
+
+/*
* Attempt to initiate asynchronous I/O on read-ahead blocks. We must
* clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set,
* the buffer is valid and we do not have to do anything.
@@ -1900,14 +2357,11 @@ brelse(struct buf *bp)
/* buffers with no memory */
if (bp->b_bufsize == 0) {
- bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
- if (bp->b_vflags & BV_BKGRDINPROG)
- panic("losing buffer 1");
- bufkvafree(bp);
- qindex = QUEUE_EMPTY;
- bp->b_flags |= B_AGE;
+ buf_free(bp);
+ return;
+ }
/* buffers with junk contents */
- } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
+ if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
(bp->b_ioflags & BIO_ERROR)) {
bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
if (bp->b_vflags & BV_BKGRDINPROG)
@@ -1927,6 +2381,8 @@ brelse(struct buf *bp)
panic("brelse: not dirty");
/* unlock */
BUF_UNLOCK(bp);
+ if (qindex == QUEUE_CLEAN)
+ bufspace_wakeup();
}
/*
@@ -1949,6 +2405,7 @@ bqrelse(struct buf *bp)
KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
+ qindex = QUEUE_NONE;
if (BUF_LOCKRECURSED(bp)) {
/* do not release to free list */
BUF_UNLOCK(bp);
@@ -1984,6 +2441,8 @@ bqrelse(struct buf *bp)
out:
/* unlock */
BUF_UNLOCK(bp);
+ if (qindex == QUEUE_CLEAN)
+ bufspace_wakeup();
}
/*
@@ -2383,297 +2842,26 @@ vfs_bio_awrite(struct buf *bp)
}
/*
- * Ask the bufdaemon for help, or act as bufdaemon itself, when a
- * locked vnode is supplied.
+ * getnewbuf_kva:
+ *
+ * Allocate KVA for an empty buf header according to gbflags.
*/
-static void
-getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
- int defrag)
-{
- struct thread *td;
- char *waitmsg;
- int error, fl, flags, norunbuf;
-
- mtx_assert(&bqclean, MA_OWNED);
-
- if (defrag) {
- flags = VFS_BIO_NEED_BUFSPACE;
- waitmsg = "nbufkv";
- } else if (bufspace >= hibufspace) {
- waitmsg = "nbufbs";
- flags = VFS_BIO_NEED_BUFSPACE;
- } else {
- waitmsg = "newbuf";
- flags = VFS_BIO_NEED_ANY;
- }
- atomic_set_int(&needsbuffer, flags);
- mtx_unlock(&bqclean);
-
- bd_speedup(); /* heeeelp */
- if ((gbflags & GB_NOWAIT_BD) != 0)
- return;
-
- td = curthread;
- rw_wlock(&nblock);
- while ((needsbuffer & flags) != 0) {
- if (vp != NULL && vp->v_type != VCHR &&
- (td->td_pflags & TDP_BUFNEED) == 0) {
- rw_wunlock(&nblock);
- /*
- * getblk() is called with a vnode locked, and
- * some majority of the dirty buffers may as
- * well belong to the vnode. Flushing the
- * buffers there would make a progress that
- * cannot be achieved by the buf_daemon, that
- * cannot lock the vnode.
- */
- norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
- (td->td_pflags & TDP_NORUNNINGBUF);
-
- /*
- * Play bufdaemon. The getnewbuf() function
- * may be called while the thread owns lock
- * for another dirty buffer for the same
- * vnode, which makes it impossible to use
- * VOP_FSYNC() there, due to the buffer lock
- * recursion.
- */
- td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
- fl = buf_flush(vp, flushbufqtarget);
- td->td_pflags &= norunbuf;
- rw_wlock(&nblock);
- if (fl != 0)
- continue;
- if ((needsbuffer & flags) == 0)
- break;
- }
- error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
- (PRIBIO + 4) | slpflag, waitmsg, slptimeo);
- if (error != 0)
- break;
- }
- rw_wunlock(&nblock);
-}
-
-static void
-getnewbuf_reuse_bp(struct buf *bp, int qindex)
-{
-
- CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
- "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
- bp->b_kvasize, bp->b_bufsize, qindex);
- mtx_assert(&bqclean, MA_NOTOWNED);
-
- /*
- * Note: we no longer distinguish between VMIO and non-VMIO
- * buffers.
- */
- KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
- ("invalid buffer %p flags %#x found in queue %d", bp, bp->b_flags,
- qindex));
-
- /*
- * When recycling a clean buffer we have to truncate it and
- * release the vnode.
- */
- if (qindex == QUEUE_CLEAN) {
- allocbuf(bp, 0);
- if (bp->b_vp != NULL)
- brelvp(bp);
- }
-
- /*
- * Get the rest of the buffer freed up. b_kva* is still valid
- * after this operation.
- */
- if (bp->b_rcred != NOCRED) {
- crfree(bp->b_rcred);
- bp->b_rcred = NOCRED;
- }
- if (bp->b_wcred != NOCRED) {
- crfree(bp->b_wcred);
- bp->b_wcred = NOCRED;
- }
- if (!LIST_EMPTY(&bp->b_dep))
- buf_deallocate(bp);
- if (bp->b_vflags & BV_BKGRDINPROG)
- panic("losing buffer 3");
- KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p. qindex: %d",
- bp, bp->b_vp, qindex));
- KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
- ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
- KASSERT(bp->b_npages == 0,
- ("bp: %p still has %d vm pages\n", bp, bp->b_npages));
-
- bp->b_flags = 0;
- bp->b_ioflags = 0;
- bp->b_xflags = 0;
- KASSERT((bp->b_flags & B_INFREECNT) == 0,
- ("buf %p still counted as free?", bp));
- bp->b_vflags = 0;
- bp->b_vp = NULL;
- bp->b_blkno = bp->b_lblkno = 0;
- bp->b_offset = NOOFFSET;
- bp->b_iodone = 0;
- bp->b_error = 0;
- bp->b_resid = 0;
- bp->b_bcount = 0;
- bp->b_npages = 0;
- bp->b_dirtyoff = bp->b_dirtyend = 0;
- bp->b_bufobj = NULL;
- bp->b_pin_count = 0;
- bp->b_data = bp->b_kvabase;
- bp->b_fsprivate1 = NULL;
- bp->b_fsprivate2 = NULL;
- bp->b_fsprivate3 = NULL;
-
- LIST_INIT(&bp->b_dep);
-}
-
-static struct buf *
-getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata)
+static int
+getnewbuf_kva(struct buf *bp, int gbflags, int maxsize)
{
- struct buf *bp, *nbp;
- int nqindex, qindex, pass;
-
- KASSERT(!unmapped || !defrag, ("both unmapped and defrag"));
-
- pass = 0;
-restart:
- if (pass != 0)
- atomic_add_int(&getnewbufrestarts, 1);
-
- nbp = NULL;
- mtx_lock(&bqclean);
- /*
- * If we're not defragging or low on bufspace attempt to make a new
- * buf from a header.
- */
- if (defrag == 0 && bufspace + maxsize < hibufspace) {
- nqindex = QUEUE_EMPTY;
- nbp = TAILQ_FIRST(&bufqueues[nqindex]);
- }
- /*
- * All available buffers might be clean or we need to start recycling.
- */
- if (nbp == NULL) {
- nqindex = QUEUE_CLEAN;
- nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
- }
-
- /*
- * Run scan, possibly freeing data and/or kva mappings on the fly
- * depending.
- */
- while ((bp = nbp) != NULL) {
- qindex = nqindex;
-
- /*
- * Calculate next bp (we can only use it if we do not
- * release the bqlock)
- */
- if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
- switch (qindex) {
- case QUEUE_EMPTY:
- nqindex = QUEUE_CLEAN;
- nbp = TAILQ_FIRST(&bufqueues[nqindex]);
- if (nbp != NULL)
- break;
- /* FALLTHROUGH */
- case QUEUE_CLEAN:
- if (metadata && pass == 0) {
- pass = 1;
- nqindex = QUEUE_EMPTY;
- nbp = TAILQ_FIRST(&bufqueues[nqindex]);
- }
- /*
- * nbp is NULL.
- */
- break;
- }
- }
- /*
- * If we are defragging then we need a buffer with
- * b_kvasize != 0. This situation occurs when we
- * have many unmapped bufs.
- */
- if (defrag && bp->b_kvasize == 0)
- continue;
-
- /*
- * Start freeing the bp. This is somewhat involved. nbp
- * remains valid only for QUEUE_EMPTY[KVA] bp's.
- */
- if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
- continue;
- /*
- * BKGRDINPROG can only be set with the buf and bufobj
- * locks both held. We tolerate a race to clear it here.
- */
- if (bp->b_vflags & BV_BKGRDINPROG) {
- BUF_UNLOCK(bp);
- continue;
- }
-
- /*
- * Requeue the background write buffer with error.
- */
- if ((bp->b_vflags & BV_BKGRDERR) != 0) {
- bremfreel(bp);
- mtx_unlock(&bqclean);
- bqrelse(bp);
- continue;
- }
-
- KASSERT(bp->b_qindex == qindex,
- ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
-
- bremfreel(bp);
- mtx_unlock(&bqclean);
-
- /*
- * NOTE: nbp is now entirely invalid. We can only restart
- * the scan from this point on.
- */
- getnewbuf_reuse_bp(bp, qindex);
- mtx_assert(&bqclean, MA_NOTOWNED);
-
- /*
- * If we are defragging then free the buffer.
- */
- if (defrag) {
- bp->b_flags |= B_INVAL;
- brelse(bp);
- defrag = 0;
- goto restart;
- }
+ if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_UNMAPPED) {
/*
- * Notify any waiters for the buffer lock about
- * identity change by freeing the buffer.
+ * In order to keep fragmentation sane we only allocate kva
+ * in BKVASIZE chunks. XXX with vmem we can do page size.
*/
- if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp)) {
- bp->b_flags |= B_INVAL;
- brelse(bp);
- goto restart;
- }
-
- if (metadata)
- break;
+ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
- /*
- * If we are overcomitted then recover the buffer and its
- * KVM space. This occurs in rare situations when multiple
- * processes are blocked in getnewbuf() or allocbuf().
- */
- if (bufspace >= hibufspace && bp->b_kvasize != 0) {
- bp->b_flags |= B_INVAL;
- brelse(bp);
- goto restart;
- }
- break;
+ if (maxsize != bp->b_kvasize &&
+ bufkva_alloc(bp, maxsize, gbflags))
+ return (ENOSPC);
}
- return (bp);
+ return (0);
}
/*
@@ -2682,86 +2870,54 @@ restart:
* Find and initialize a new buffer header, freeing up existing buffers
* in the bufqueues as necessary. The new buffer is returned locked.
*
- * Important: B_INVAL is not set. If the caller wishes to throw the
- * buffer away, the caller must set B_INVAL prior to calling brelse().
- *
* We block if:
* We have insufficient buffer headers
* We have insufficient buffer space
* buffer_arena is too fragmented ( space reservation fails )
* If we have to flush dirty buffers ( but we try to avoid this )
+ *
+ * The caller is responsible for releasing the reserved bufspace after
+ * allocbuf() is called.
*/
static struct buf *
-getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
- int gbflags)
+getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags)
{
struct buf *bp;
- int defrag, metadata;
+ bool metadata, reserved;
KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
if (!unmapped_buf_allowed)
gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
- defrag = 0;
if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
vp->v_type == VCHR)
- metadata = 1;
+ metadata = true;
else
- metadata = 0;
- /*
- * We can't afford to block since we might be holding a vnode lock,
- * which may prevent system daemons from running. We deal with
- * low-memory situations by proactively returning memory and running
- * async I/O rather then sync I/O.
- */
+ metadata = false;
atomic_add_int(&getnewbufcalls, 1);
-restart:
- bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED |
- GB_KVAALLOC)) == GB_UNMAPPED, metadata);
- if (bp != NULL)
- defrag = 0;
-
- /*
- * If we exhausted our list, sleep as appropriate. We may have to
- * wakeup various daemons and write out some dirty buffers.
- *
- * Generally we are sleeping due to insufficient buffer space.
- */
- if (bp == NULL) {
- mtx_assert(&bqclean, MA_OWNED);
- getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
- mtx_assert(&bqclean, MA_NOTOWNED);
- } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
- mtx_assert(&bqclean, MA_NOTOWNED);
-
- bufkvafree(bp);
- atomic_add_int(&bufreusecnt, 1);
- } else {
- mtx_assert(&bqclean, MA_NOTOWNED);
-
- /*
- * We finally have a valid bp. We aren't quite out of the
- * woods, we still have to reserve kva space. In order to
- * keep fragmentation sane we only allocate kva in BKVASIZE
- * chunks.
- */
- maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
+ reserved = false;
+ do {
+ if (reserved == false &&
+ bufspace_reserve(maxsize, metadata) != 0)
+ continue;
+ reserved = true;
+ if ((bp = buf_alloc()) == NULL)
+ continue;
+ if (getnewbuf_kva(bp, gbflags, maxsize) == 0)
+ return (bp);
+ break;
+ } while (buf_scan(false) == 0);
- if (maxsize != bp->b_kvasize &&
- bufkvaalloc(bp, maxsize, gbflags)) {
- defrag = 1;
- bp->b_flags |= B_INVAL;
- brelse(bp);
- goto restart;
- } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) ==
- (GB_UNMAPPED | GB_KVAALLOC)) {
- bp->b_data = unmapped_buf;
- BUF_CHECK_UNMAPPED(bp);
- }
- atomic_add_int(&bufreusecnt, 1);
+ if (reserved)
+ bufspace_release(maxsize);
+ if (bp != NULL) {
+ bp->b_flags |= B_INVAL;
+ brelse(bp);
}
- return (bp);
+ bufspace_wait(vp, gbflags, slpflag, slptimeo);
+
+ return (NULL);
}
/*
@@ -2771,7 +2927,6 @@ restart:
* update daemon but if it cannot keep up this process starts to
* take the load in an attempt to prevent getnewbuf() from blocking.
*/
-
static struct kproc_desc buf_kp = {
"bufdaemon",
buf_daemon,
@@ -2902,19 +3057,19 @@ flushbufqueues(struct vnode *lvp, int target, int flushdeps)
bp = NULL;
sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
sentinel->b_qindex = QUEUE_SENTINEL;
- mtx_lock(&bqdirty);
+ mtx_lock(&bqlocks[queue]);
TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
- mtx_unlock(&bqdirty);
+ mtx_unlock(&bqlocks[queue]);
while (flushed != target) {
maybe_yield();
- mtx_lock(&bqdirty);
+ mtx_lock(&bqlocks[queue]);
bp = TAILQ_NEXT(sentinel, b_freelist);
if (bp != NULL) {
TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
b_freelist);
} else {
- mtx_unlock(&bqdirty);
+ mtx_unlock(&bqlocks[queue]);
break;
}
/*
@@ -2926,11 +3081,11 @@ flushbufqueues(struct vnode *lvp, int target, int flushdeps)
*/
if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL &&
bp->b_vp != lvp)) {
- mtx_unlock(&bqdirty);
+ mtx_unlock(&bqlocks[queue]);
continue;
}
error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL);
- mtx_unlock(&bqdirty);
+ mtx_unlock(&bqlocks[queue]);
if (error != 0)
continue;
if (bp->b_pin_count > 0) {
@@ -3013,9 +3168,9 @@ flushbufqueues(struct vnode *lvp, int target, int flushdeps)
vn_finished_write(mp);
BUF_UNLOCK(bp);
}
- mtx_lock(&bqdirty);
+ mtx_lock(&bqlocks[queue]);
TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
- mtx_unlock(&bqdirty);
+ mtx_unlock(&bqlocks[queue]);
free(sentinel, M_TEMP);
return (flushed);
}
@@ -3196,7 +3351,6 @@ vfs_setdirty_locked_object(struct buf *bp)
static void
bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
{
- struct buf *scratch_bp;
int bsize, maxsize, need_mapping, need_kva;
off_t offset;
@@ -3229,37 +3383,16 @@ bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
maxsize = size + (offset & PAGE_MASK);
maxsize = imax(maxsize, bsize);
-mapping_loop:
- if (bufkvaalloc(bp, maxsize, gbflags)) {
- /*
- * Request defragmentation. getnewbuf() returns us the
- * allocated space by the scratch buffer KVA.
- */
- scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags |
- (GB_UNMAPPED | GB_KVAALLOC));
- if (scratch_bp == NULL) {
- if ((gbflags & GB_NOWAIT_BD) != 0) {
- /*
- * XXXKIB: defragmentation cannot
- * succeed, not sure what else to do.
- */
- panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp);
- }
- atomic_add_int(&mappingrestarts, 1);
- goto mapping_loop;
+ while (bufkva_alloc(bp, maxsize, gbflags) != 0) {
+ if ((gbflags & GB_NOWAIT_BD) != 0) {
+ /*
+ * XXXKIB: defragmentation cannot
+ * succeed, not sure what else to do.
+ */
+ panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp);
}
- KASSERT(scratch_bp->b_kvabase != unmapped_buf,
- ("scratch bp has no KVA %p", scratch_bp));
- /* Grab pointers. */
- bp->b_kvabase = scratch_bp->b_kvabase;
- bp->b_kvasize = scratch_bp->b_kvasize;
- bp->b_data = scratch_bp->b_data;
-
- /* Get rid of the scratch buffer. */
- scratch_bp->b_kvasize = 0;
- scratch_bp->b_flags |= B_INVAL;
- scratch_bp->b_data = scratch_bp->b_kvabase = unmapped_buf;
- brelse(scratch_bp);
+ atomic_add_int(&mappingrestarts, 1);
+ bufspace_wait(bp->b_vp, gbflags, 0, 0);
}
has_addr:
if (need_mapping) {
@@ -3486,7 +3619,7 @@ loop:
}
maxsize = imax(maxsize, bsize);
- bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
+ bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags);
if (bp == NULL) {
if (slpflag || slptimeo)
return NULL;
@@ -3510,6 +3643,7 @@ loop:
BO_UNLOCK(bo);
bp->b_flags |= B_INVAL;
brelse(bp);
+ bufspace_release(maxsize);
goto loop;
}
@@ -3543,6 +3677,7 @@ loop:
}
allocbuf(bp, size);
+ bufspace_release(maxsize);
bp->b_flags &= ~B_DONE;
}
CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
@@ -3564,12 +3699,13 @@ geteblk(int size, int flags)
int maxsize;
maxsize = (size + BKVAMASK) & ~BKVAMASK;
- while ((bp = getnewbuf(NULL, 0, 0, size, maxsize, flags)) == NULL) {
+ while ((bp = getnewbuf(NULL, 0, 0, maxsize, flags)) == NULL) {
if ((flags & GB_NOWAIT_BD) &&
(curthread->td_pflags & TDP_BUFNEED) != 0)
return (NULL);
}
allocbuf(bp, size);
+ bufspace_release(maxsize);
bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */
BUF_ASSERT_HELD(bp);
return (bp);
@@ -3595,7 +3731,7 @@ vfs_nonvmio_truncate(struct buf *bp, int newbsize)
return;
}
vm_hold_free_pages(bp, newbsize);
- bufspaceadjust(bp, newbsize);
+ bufspace_adjust(bp, newbsize);
}
/*
@@ -3646,7 +3782,7 @@ vfs_nonvmio_extend(struct buf *bp, int newbsize)
bcopy(origbuf, bp->b_data, origbufsize);
free(origbuf, M_BIOBUF);
}
- bufspaceadjust(bp, newbsize);
+ bufspace_adjust(bp, newbsize);
}
/*
@@ -3708,7 +3844,7 @@ allocbuf(struct buf *bp, int size)
/* XXX This looks as if it should be newbsize > b_bufsize */
else if (size > bp->b_bcount)
vfs_vmio_extend(bp, desiredpages, size);
- bufspaceadjust(bp, newbsize);
+ bufspace_adjust(bp, newbsize);
}
bp->b_bcount = size; /* requested buffer size. */
return (1);
@@ -4596,7 +4732,7 @@ DB_COMMAND(countfreebufs, db_coundfreebufs)
for (i = 0; i < nbuf; i++) {
bp = &buf[i];
- if ((bp->b_flags & B_INFREECNT) != 0)
+ if (bp->b_qindex == QUEUE_EMPTY)
nfree++;
else
used++;