author:    marius <marius@FreeBSD.org>    2016-02-23 01:09:35 +0000
committer: marius <marius@FreeBSD.org>    2016-02-23 01:09:35 +0000
commit:    257e82612c2ce6368f8fbbfa099f699c85f3a718
tree:      e9044537d588e919dc97dd74cfde80e0732bdd85
parent:    64f5438a3620a9b569443eb2d7968df9515df81e
In preparation for 10.3-RELEASE, temporarily revert the MFC of r291244
done as part of r292895 on stable/10, as that change causes hangs with
ZFS and the cause, on amd64 at least, is so far not understood.

Discussed with:	kib
For further information see:	https://lists.freebsd.org/pipermail/freebsd-stable/2016-February/084045.html
PR:		207281
Approved by:	re (gjb)
-rw-r--r--	sys/kern/vfs_subr.c	322
1 file changed, 80 insertions, 242 deletions
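The sysctl knobs whose meaning this revert restores (kern.maxvnodes,
vfs.wantfreevnodes, vfs.freevnodes and vfs.vlru_allow_cache_src) can be
inspected from userland. The sketch below is not part of the commit; it is
a minimal illustration using sysctlbyname(3), and assumes the pre-r291244
types shown in the diff (int for kern.maxvnodes and
vfs.vlru_allow_cache_src, u_long for the vfs vnode counters).

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int maxvnodes, allow_cache_src;
	u_long wantfree, freevn;
	size_t len;

	/* kern.maxvnodes backs the kernel's desiredvnodes. */
	len = sizeof(maxvnodes);
	if (sysctlbyname("kern.maxvnodes", &maxvnodes, &len, NULL, 0) == 0)
		printf("kern.maxvnodes: %d\n", maxvnodes);

	/* vfs.wantfreevnodes is the free-list target used by vnlru_proc(). */
	len = sizeof(wantfree);
	if (sysctlbyname("vfs.wantfreevnodes", &wantfree, &len, NULL, 0) == 0)
		printf("vfs.wantfreevnodes: %lu\n", wantfree);

	/* vfs.freevnodes is the current length of the free list. */
	len = sizeof(freevn);
	if (sysctlbyname("vfs.freevnodes", &freevn, &len, NULL, 0) == 0)
		printf("vfs.freevnodes: %lu\n", freevn);

	/* vfs.vlru_allow_cache_src regains its pre-r291244 meaning here. */
	len = sizeof(allow_cache_src);
	if (sysctlbyname("vfs.vlru_allow_cache_src", &allow_cache_src, &len,
	    NULL, 0) == 0)
		printf("vfs.vlru_allow_cache_src: %d\n", allow_cache_src);

	return (0);
}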
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index a721c5a..aa81313 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -145,51 +145,24 @@ int vttoif_tab[10] = {
static TAILQ_HEAD(freelst, vnode) vnode_free_list;
/*
- * "Free" vnode target. Free vnodes are rarely completely free, but are
- * just ones that are cheap to recycle. Usually they are for files which
- * have been stat'd but not read; these usually have inode and namecache
- * data attached to them. This target is the preferred minimum size of a
- * sub-cache consisting mostly of such files. The system balances the size
- * of this sub-cache with its complement to try to prevent either from
- * thrashing while the other is relatively inactive. The targets express
- * a preference for the best balance.
- *
- * "Above" this target there are 2 further targets (watermarks) related
- * to recyling of free vnodes. In the best-operating case, the cache is
- * exactly full, the free list has size between vlowat and vhiwat above the
- * free target, and recycling from it and normal use maintains this state.
- * Sometimes the free list is below vlowat or even empty, but this state
- * is even better for immediate use provided the cache is not full.
- * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
- * ones) to reach one of these states. The watermarks are currently hard-
- * coded as 4% and 9% of the available space higher. These and the default
- * of 25% for wantfreevnodes are too large if the memory size is large.
- * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
- * whenever vnlru_proc() becomes active.
+ * Free vnode target. Free vnodes may simply be files which have been stat'd
+ * but not read. This is somewhat common, and a small cache of such files
+ * should be kept to avoid recreation costs.
*/
static u_long wantfreevnodes;
-SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
- &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes");
+SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
+/* Number of vnodes in the free list. */
static u_long freevnodes;
-SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
- &freevnodes, 0, "Number of \"free\" vnodes");
+SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
+ "Number of vnodes in the free list");
-/*
- * The vfs.vlru_allow_cache_src sysctl variable is no longer used but
- * the sysctl remains to provide ABI compatibility. The new code frees
- * namecache sources as the last chance to satisfy the highest watermark,
- * instead of selecting the source vnodes randomly. This provides good
- * enough behaviour to keep vn_fullpath() working in most situations.
- * The filesystem layout with deep trees, where the depricated knob was
- * required, is thus handled automatically.
- */
static int vlru_allow_cache_src;
SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
- &vlru_allow_cache_src, 0, "Placeholder for API compatibility (unused)");
+ &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
static u_long recycles_count;
SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0,
- "Number of vnodes recycled to meet vnode cache targets");
+ "Number of vnodes recycled to avoid exceding kern.maxvnodes");
/*
* Various variables used for debugging the new implementation of
@@ -299,13 +272,14 @@ static int syncer_worklist_len;
static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
syncer_state;
-/* Target for maximum number of vnodes. */
+/*
+ * Number of vnodes we want to exist at any one time. This is mostly used
+ * to size hash tables in vnode-related code. It is normally not used in
+ * getnewvnode(), as wantfreevnodes is normally nonzero.)
+ *
+ * XXX desiredvnodes is historical cruft and should not exist.
+ */
int desiredvnodes;
-static int gapvnodes; /* gap between wanted and desired */
-static int vhiwat; /* enough extras after expansion */
-static int vlowat; /* minimal extras before expansion */
-static int vstir; /* nonzero to stir non-free vnodes */
-static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */
static int
sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
@@ -316,8 +290,6 @@ sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
return (error);
if (old_desiredvnodes != desiredvnodes) {
- wantfreevnodes = desiredvnodes / 4;
- /* XXX locking seems to be incomplete. */
vfs_hash_changesize(desiredvnodes);
cache_changesize(desiredvnodes);
}
@@ -326,9 +298,9 @@ sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
- sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes");
+ sysctl_update_desiredvnodes, "I", "Maximum number of vnodes");
SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
- &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
+ &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
static int vnlru_nowhere;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
&vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
@@ -359,10 +331,10 @@ PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
*
* Reevaluate the following cap on the number of vnodes after the physical
* memory size exceeds 512GB. In the limit, as the physical memory size
- * grows, the ratio of the memory size in KB to to vnodes approaches 64:1.
+ * grows, the ratio of physical pages to vnodes approaches sixteen to one.
*/
#ifndef MAXVNODES_MAX
-#define MAXVNODES_MAX (512 * 1024 * 1024 / 64) /* 8M */
+#define MAXVNODES_MAX (512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
#endif
/*
@@ -433,16 +405,15 @@ vntblinit(void *dummy __unused)
/*
* Desiredvnodes is a function of the physical memory size and the
* kernel's heap size. Generally speaking, it scales with the
- * physical memory size. The ratio of desiredvnodes to the physical
- * memory size is 1:16 until desiredvnodes exceeds 98,304.
- * Thereafter, the
- * marginal ratio of desiredvnodes to the physical memory size is
- * 1:64. However, desiredvnodes is limited by the kernel's heap
+ * physical memory size. The ratio of desiredvnodes to physical pages
+ * is one to four until desiredvnodes exceeds 98,304. Thereafter, the
+ * marginal ratio of desiredvnodes to physical pages is one to
+ * sixteen. However, desiredvnodes is limited by the kernel's heap
* size. The memory required by desiredvnodes vnodes and vm objects
- * must not exceed 1/7th of the kernel's heap size.
+ * may not exceed one seventh of the kernel's heap size.
*/
- physvnodes = maxproc + pgtok(cnt.v_page_count) / 64 +
- 3 * min(98304 * 16, pgtok(cnt.v_page_count)) / 64;
+ physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4,
+ cnt.v_page_count) / 16;
virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
sizeof(struct vnode)));
desiredvnodes = min(physvnodes, virtvnodes);
@@ -831,41 +802,35 @@ vattr_null(struct vattr *vap)
* you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
*/
static int
-vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger)
+vlrureclaim(struct mount *mp)
{
struct vnode *vp;
- int count, done, target;
+ int done;
+ int trigger;
+ int usevnodes;
+ int count;
+ /*
+ * Calculate the trigger point, don't allow user
+ * screwups to blow us up. This prevents us from
+ * recycling vnodes with lots of resident pages. We
+ * aren't trying to free memory, we are trying to
+ * free vnodes.
+ */
+ usevnodes = desiredvnodes;
+ if (usevnodes <= 0)
+ usevnodes = 1;
+ trigger = cnt.v_page_count * 2 / usevnodes;
done = 0;
vn_start_write(NULL, &mp, V_WAIT);
MNT_ILOCK(mp);
- count = mp->mnt_nvnodelistsize;
- target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1);
- target = target / 10 + 1;
- while (count != 0 && done < target) {
+ count = mp->mnt_nvnodelistsize / 10 + 1;
+ while (count != 0) {
vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
while (vp != NULL && vp->v_type == VMARKER)
vp = TAILQ_NEXT(vp, v_nmntvnodes);
if (vp == NULL)
break;
- /*
- * XXX LRU is completely broken for non-free vnodes. First
- * by calling here in mountpoint order, then by moving
- * unselected vnodes to the end here, and most grossly by
- * removing the vlruvp() function that was supposed to
- * maintain the order. (This function was born broken
- * since syncer problems prevented it doing anything.) The
- * order is closer to LRC (C = Created).
- *
- * LRU reclaiming of vnodes seems to have last worked in
- * FreeBSD-3 where LRU wasn't mentioned under any spelling.
- * Then there was no hold count, and inactive vnodes were
- * simply put on the free list in LRU order. The separate
- * lists also break LRU. We prefer to reclaim from the
- * free list for technical reasons. This tends to thrash
- * the free list to keep very unrecently used held vnodes.
- * The problem is mitigated by keeping the free list large.
- */
TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
--count;
@@ -874,12 +839,10 @@ vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger)
/*
* If it's been deconstructed already, it's still
* referenced, or it exceeds the trigger, skip it.
- * Also skip free vnodes. We are trying to make space
- * to expand the free list, not reduce it.
*/
if (vp->v_usecount ||
- (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
- ((vp->v_iflag & VI_FREE) != 0) ||
+ (!vlru_allow_cache_src &&
+ !LIST_EMPTY(&(vp)->v_cache_src)) ||
(vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
vp->v_object->resident_page_count > trigger)) {
VI_UNLOCK(vp);
@@ -905,8 +868,8 @@ vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger)
* vnode lock before our VOP_LOCK() call fails.
*/
if (vp->v_usecount ||
- (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
- (vp->v_iflag & VI_FREE) != 0 ||
+ (!vlru_allow_cache_src &&
+ !LIST_EMPTY(&(vp)->v_cache_src)) ||
(vp->v_object != NULL &&
vp->v_object->resident_page_count > trigger)) {
VOP_UNLOCK(vp, LK_INTERLOCK);
@@ -939,7 +902,7 @@ relock_mnt:
}
/*
- * Attempt to reduce the free list by the requested amount.
+ * Attempt to keep the free list at wantfreevnodes length.
*/
static void
vnlru_free(int count)
@@ -996,24 +959,6 @@ vnlru_free(int count)
mtx_lock(&vnode_free_list_mtx);
}
}
-
-/* XXX some names and initialization are bad for limits and watermarks. */
-static int
-vspace(void)
-{
- int space;
-
- gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
- vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
- vlowat = vhiwat / 2;
- if (numvnodes > desiredvnodes)
- return (0);
- space = desiredvnodes - numvnodes;
- if (freevnodes > wantfreevnodes)
- space += freevnodes - wantfreevnodes;
- return (space);
-}
-
/*
* Attempt to recycle vnodes in a context that is always safe to block.
* Calling vlrurecycle() from the bowels of filesystem code has some
@@ -1026,36 +971,18 @@ static void
vnlru_proc(void)
{
struct mount *mp, *nmp;
- unsigned long ofreevnodes, onumvnodes;
- int done, force, reclaim_nc_src, trigger, usevnodes;
+ int done;
+ struct proc *p = vnlruproc;
- EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
SHUTDOWN_PRI_FIRST);
- force = 0;
for (;;) {
- kproc_suspend_check(vnlruproc);
+ kproc_suspend_check(p);
mtx_lock(&vnode_free_list_mtx);
- /*
- * If numvnodes is too large (due to desiredvnodes being
- * adjusted using its sysctl, or emergency growth), first
- * try to reduce it by discarding from the free list.
- */
- if (numvnodes > desiredvnodes && freevnodes > 0)
- vnlru_free(ulmin(numvnodes - desiredvnodes,
- freevnodes));
- /*
- * Sleep if the vnode cache is in a good state. This is
- * when it is not over-full and has space for about a 4%
- * or 9% expansion (by growing its size or inexcessively
- * reducing its free list). Otherwise, try to reclaim
- * space for a 10% expansion.
- */
- if (vstir && force == 0) {
- force = 1;
- vstir = 0;
- }
- if (vspace() >= vlowat && force == 0) {
+ if (freevnodes > wantfreevnodes)
+ vnlru_free(freevnodes - wantfreevnodes);
+ if (numvnodes <= desiredvnodes * 9 / 10) {
vnlruproc_sig = 0;
wakeup(&vnlruproc_sig);
msleep(vnlruproc, &vnode_free_list_mtx,
@@ -1064,66 +991,30 @@ vnlru_proc(void)
}
mtx_unlock(&vnode_free_list_mtx);
done = 0;
- ofreevnodes = freevnodes;
- onumvnodes = numvnodes;
- /*
- * Calculate parameters for recycling. These are the same
- * throughout the loop to give some semblance of fairness.
- * The trigger point is to avoid recycling vnodes with lots
- * of resident pages. We aren't trying to free memory; we
- * are trying to recycle or at least free vnodes.
- */
- if (numvnodes <= desiredvnodes)
- usevnodes = numvnodes - freevnodes;
- else
- usevnodes = numvnodes;
- if (usevnodes <= 0)
- usevnodes = 1;
- /*
- * The trigger value is is chosen to give a conservatively
- * large value to ensure that it alone doesn't prevent
- * making progress. The value can easily be so large that
- * it is effectively infinite in some congested and
- * misconfigured cases, and this is necessary. Normally
- * it is about 8 to 100 (pages), which is quite large.
- */
- trigger = cnt.v_page_count * 2 / usevnodes;
- if (force < 2)
- trigger = vsmalltrigger;
- reclaim_nc_src = force >= 3;
mtx_lock(&mountlist_mtx);
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
nmp = TAILQ_NEXT(mp, mnt_list);
continue;
}
- done += vlrureclaim(mp, reclaim_nc_src, trigger);
+ done += vlrureclaim(mp);
mtx_lock(&mountlist_mtx);
nmp = TAILQ_NEXT(mp, mnt_list);
vfs_unbusy(mp);
}
mtx_unlock(&mountlist_mtx);
- if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
- uma_reclaim();
if (done == 0) {
- if (force == 0 || force == 1) {
- force = 2;
- continue;
- }
- if (force == 2) {
- force = 3;
- continue;
- }
- force = 0;
+#if 0
+ /* These messages are temporary debugging aids */
+ if (vnlru_nowhere < 5)
+ printf("vnlru process getting nowhere..\n");
+ else if (vnlru_nowhere == 5)
+ printf("vnlru process messages stopped.\n");
+#endif
vnlru_nowhere++;
tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
} else
kern_yield(PRI_USER);
- /*
- * After becoming active to expand above low water, keep
- * active until above high water.
- */
- force = vspace() < vhiwat;
}
}
@@ -1197,31 +1088,22 @@ vtryrecycle(struct vnode *vp)
return (0);
}
-static void
-vcheckspace(void)
-{
-
- if (vspace() < vlowat && vnlruproc_sig == 0) {
- vnlruproc_sig = 1;
- wakeup(vnlruproc);
- }
-}
-
/*
- * Wait if necessary for space for a new vnode.
+ * Wait for available vnodes.
*/
static int
getnewvnode_wait(int suspended)
{
mtx_assert(&vnode_free_list_mtx, MA_OWNED);
- if (numvnodes >= desiredvnodes) {
+ if (numvnodes > desiredvnodes) {
if (suspended) {
/*
- * The file system is being suspended. We cannot
- * risk a deadlock here, so allow allocation of
- * another vnode even if this would give too many.
+ * File system is beeing suspended, we cannot risk a
+ * deadlock here, so allocate new vnode anyway.
*/
+ if (freevnodes > wantfreevnodes)
+ vnlru_free(freevnodes - wantfreevnodes);
return (0);
}
if (vnlruproc_sig == 0) {
@@ -1231,34 +1113,18 @@ getnewvnode_wait(int suspended)
msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
"vlruwk", hz);
}
- /* Post-adjust like the pre-adjust in getnewvnode(). */
- if (numvnodes + 1 > desiredvnodes && freevnodes > 1)
- vnlru_free(1);
- return (numvnodes >= desiredvnodes ? ENFILE : 0);
+ return (numvnodes > desiredvnodes ? ENFILE : 0);
}
-/*
- * This hack is fragile, and probably not needed any more now that the
- * watermark handling works.
- */
void
getnewvnode_reserve(u_int count)
{
struct thread *td;
- /* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */
- /* XXX no longer so quick, but this part is not racy. */
- mtx_lock(&vnode_free_list_mtx);
- if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes)
- vnlru_free(ulmin(numvnodes + count - desiredvnodes,
- freevnodes - wantfreevnodes));
- mtx_unlock(&vnode_free_list_mtx);
-
td = curthread;
/* First try to be quick and racy. */
if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
td->td_vp_reserv += count;
- vcheckspace(); /* XXX no longer so quick, but more racy */
return;
} else
atomic_subtract_long(&numvnodes, count);
@@ -1271,18 +1137,9 @@ getnewvnode_reserve(u_int count)
atomic_add_long(&numvnodes, 1);
}
}
- vcheckspace();
mtx_unlock(&vnode_free_list_mtx);
}
-/*
- * This hack is fragile, especially if desiredvnodes or wantvnodes are
- * misconfgured or changed significantly. Reducing desiredvnodes below
- * the reserved amount should cause bizarre behaviour like reducing it
- * below the number of active vnodes -- the system will try to reduce
- * numvnodes to match, but should fail, so the subtraction below should
- * not overflow.
- */
void
getnewvnode_drop_reserve(void)
{
@@ -1303,7 +1160,6 @@ getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
struct vnode *vp;
struct thread *td;
struct lock_object *lo;
- static int cyclecount;
int error;
CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
@@ -1314,37 +1170,19 @@ getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
goto alloc;
}
mtx_lock(&vnode_free_list_mtx);
- if (numvnodes < desiredvnodes)
- cyclecount = 0;
- else if (cyclecount++ >= freevnodes) {
- cyclecount = 0;
- vstir = 1;
- }
/*
- * Grow the vnode cache if it will not be above its target max
- * after growing. Otherwise, if the free list is nonempty, try
- * to reclaim 1 item from it before growing the cache (possibly
- * above its target max if the reclamation failed or is delayed).
- * Otherwise, wait for some space. In all cases, schedule
- * vnlru_proc() if we are getting short of space. The watermarks
- * should be chosen so that we never wait or even reclaim from
- * the free list to below its target minimum.
+ * Lend our context to reclaim vnodes if they've exceeded the max.
*/
- if (numvnodes + 1 <= desiredvnodes)
- ;
- else if (freevnodes > 0)
+ if (freevnodes > wantfreevnodes)
vnlru_free(1);
- else {
- error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
- MNTK_SUSPEND));
+ error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
+ MNTK_SUSPEND));
#if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */
- if (error != 0) {
- mtx_unlock(&vnode_free_list_mtx);
- return (error);
- }
-#endif
+ if (error != 0) {
+ mtx_unlock(&vnode_free_list_mtx);
+ return (error);
}
- vcheckspace();
+#endif
atomic_add_long(&numvnodes, 1);
mtx_unlock(&vnode_free_list_mtx);
alloc: