summaryrefslogtreecommitdiffstats
path: root/contrib/sys
diff options
context:
space:
mode:
authorjulian <julian@FreeBSD.org>1998-05-19 21:45:53 +0000
committerjulian <julian@FreeBSD.org>1998-05-19 21:45:53 +0000
commit0cc808ba0d80f670fdbfdad8596b352a02661706 (patch)
treeec3a27afcc6ad001da6ddc8d4a48b41358f52290 /contrib/sys
parent9feb27237255a180d341b3412632902b6444e5db (diff)
downloadFreeBSD-src-0cc808ba0d80f670fdbfdad8596b352a02661706.zip
FreeBSD-src-0cc808ba0d80f670fdbfdad8596b352a02661706.tar.gz
Merge in Kirk's changes to stop softupdates from hogging all of memory.
Diffstat (limited to 'contrib/sys')
-rw-r--r--contrib/sys/softupdates/README66
-rw-r--r--contrib/sys/softupdates/ffs_softdep.c246
-rw-r--r--contrib/sys/softupdates/softdep.h49
3 files changed, 294 insertions, 67 deletions
diff --git a/contrib/sys/softupdates/README b/contrib/sys/softupdates/README
index d4676c9..097eeca 100644
--- a/contrib/sys/softupdates/README
+++ b/contrib/sys/softupdates/README
@@ -249,3 +249,69 @@ code and installed the updated utilities, do the following:
it and run `./doit'. You may want to check out each of the
three subtests individually first: doit1 - andrew benchmarks,
doit2 - copy and removal of /etc, doit3 - find from /.
+
+====
+Additional notes from Feb 13
+
+When removing huge directories of files, it is possible to get
+the incore state arbitrarily far ahead of the disk. Maintaining
+all the associated dependency information can exhaust the kernel
+malloc arena. To avoid this scenario, I have put some limits on
+the soft update code so that it will not be allowed to rampage
+through all of the kernel memory. I enclose below the relevant
+patches to vnode.h and vfs_subr.c (which allow the soft update
+code to speed up the filesystem syncer process). I have also
+included the diffs for ffs_softdep.c. I hope to make a pass over
+ffs_softdep.c to isolate the differences with my standard version
+so that these diffs are less painful to incorporate.
+
+Since I know you like to play with tuning, I have put the relevant
+knobs on sysctl debug variables. The tuning knobs can be viewed
+with `sysctl debug' and set with `sysctl -w debug.<name>=value'.
+The knobs are as follows:
+
+ debug.max_softdeps - limit on any given resource
+ debug.tickdelay - ticks to delay before allocating
+ debug.max_limit_hit - number of times tickdelay imposed
+ debug.rush_requests - number of rush requests to filesystem syncer
+
+The max_softdeps limit is derived from desiredvnodes which in
+turn is sized based on the amount of memory on the machine.
+When the limit is hit, a process requesting a resource first
+tries to speed up the filesystem syncer process. Such a
+request is recorded as a rush_request. After syncdelay / 2
+unserviced rush requests (typically 15) are in the filesystem
+syncer's queue (i.e., it is more than 15 seconds behind in its
+work), the process requesting the memory is put to sleep for
+tickdelay ticks. Such a delay is recorded in max_limit_hit.
+Following this delay it is granted its memory without further
+delay. I have tried the following experiments in which I
+delete an MH directory containing 16,703 files:
+
+Run # 1 2 3
+
+max_softdeps 4496 4496 4496
+tickdelay 100 == 1 sec 20 == 0.2 sec 2 == 0.02 sec
+max_limit_hit 16 == 16 sec 27 == 5.4 sec 203 == 4.1 sec
+rush_requests 147 102 93
+run time 57 sec 46 sec 45 sec
+I/O's 781 859 936
+
+When run with no limits, it completes in 40 seconds. So, the
+time spent in delay is directly added to the bottom line.
+Shortening the tick delay does cut down the total running time,
+but at the expense of generating more total I/O operations
+due to the rush orders being sent to the filesystem syncer.
+Although the number of rush orders decreases with a shorter
+tick delay, there are more requests in each order, hence the
+increase in I/O count. Also, although the I/O count does rise
+with a shorter delay, it is still at least an order of magnitude
+less than without soft updates. Anyway, you may want to play
+around with these values to see what works best and to see if
+you can get an insight into how best to tune them. If you get
+out of memory panics, then you have max_softdeps set too high.
+The max_limit_hit and rush_requests should be reset to zero
+before each run. The minimum legal value for tickdelay is 2
+(if you set it below that, the code will use 2).
+
+
diff --git a/contrib/sys/softupdates/ffs_softdep.c b/contrib/sys/softupdates/ffs_softdep.c
index 118689c..b86bb2a 100644
--- a/contrib/sys/softupdates/ffs_softdep.c
+++ b/contrib/sys/softupdates/ffs_softdep.c
@@ -53,7 +53,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)ffs_softdep.c 9.14 (McKusick) 1/15/98
+ * @(#)ffs_softdep.c 9.21 (McKusick) 2/15/98
*/
/*
@@ -95,6 +95,7 @@ static int flush_pagedep_deps __P((struct vnode *, struct mount *,
struct diraddhd *));
static int flush_inodedep_deps __P((struct fs *, ino_t));
static int handle_written_filepage __P((struct pagedep *, struct buf *));
+static void diradd_inode_written __P((struct diradd *, struct inodedep *));
static int handle_written_inodeblock __P((struct inodedep *, struct buf *));
static void handle_allocdirect_partdone __P((struct allocdirect *));
static void handle_allocindir_partdone __P((struct allocindir *));
@@ -129,6 +130,8 @@ static int newblk_lookup __P((struct fs *, ufs_daddr_t, int,
static int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **));
static int pagedep_lookup __P((struct inode *, ufs_lbn_t, int,
struct pagedep **));
+static void pause_timer __P((void *));
+static int checklimit __P((long *, int));
static void add_to_worklist __P((struct worklist *));
/*
@@ -427,6 +430,28 @@ workitem_free(item, type)
*/
static struct workhead softdep_workitem_pending;
static int softdep_worklist_busy;
+static int max_softdeps; /* maximum number of structs before slowdown */
+static int tickdelay = 2; /* number of ticks to pause during slowdown */
+static int max_limit_hit; /* number of times slowdown imposed */
+static int rush_requests; /* number of times I/O speeded up */
+static int proc_waiting; /* tracks whether we have a timeout posted */
+static pid_t filesys_syncer_pid;/* records pid of filesystem syncer process */
+#ifdef DEBUG
+#include <vm/vm.h>
+#include <sys/sysctl.h>
+#if defined(__FreeBSD__)
+SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, "");
+SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, "");
+SYSCTL_INT(_debug, OID_AUTO, max_limit_hit, CTLFLAG_RW, &max_limit_hit, 0, "");
+SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &rush_requests, 0, "");
+#else /* !__FreeBSD__ */
+struct ctldebug debug4 = { "max_softdeps", &max_softdeps };
+struct ctldebug debug5 = { "tickdelay", &tickdelay };
+struct ctldebug debug6 = { "max_limit_hit", &max_limit_hit };
+struct ctldebug debug7 = { "rush_requests", &rush_requests };
+#endif /* !__FreeBSD__ */
+
+#endif /* DEBUG */
/*
* Add an item to the end of the work queue.
@@ -465,10 +490,16 @@ int
softdep_process_worklist(matchmnt)
struct mount *matchmnt;
{
+ struct proc *p = curproc;
struct worklist *wk;
struct fs *matchfs;
int matchcnt;
+ /*
+ * Record the process identifier of our caller so that we can
+ * give this process preferential treatment in checklimit below.
+ */
+ filesys_syncer_pid = p->p_pid;
matchcnt = 0;
matchfs = NULL;
if (matchmnt != NULL)
@@ -592,6 +623,71 @@ softdep_flushfiles(oldmnt, flags, p)
}
/*
+ * A large burst of file addition or deletion activity can drive the
+ * memory load excessively high. Therefore we deliberately slow things
+ * down and speed up the I/O processing if we find ourselves with too
+ * many dependencies in progress.
+ */
+static int
+checklimit(resource, islocked)
+ long *resource;
+ int islocked;
+{
+ struct proc *p = curproc;
+
+ /*
+ * If we are under our limit, just proceed.
+ */
+ if (*resource < max_softdeps)
+ return (0);
+ /*
+ * We never hold up the filesystem syncer process.
+ */
+ if (p->p_pid == filesys_syncer_pid)
+ return (0);
+ /*
+ * Our first approach is to speed up the syncer process.
+ * We never push it to speed up more than half of its
+ * normal turn time, otherwise it could take over the cpu.
+ */
+ if (rushjob < syncdelay / 2) {
+ rushjob += 1;
+ rush_requests += 1;
+ return (0);
+ }
+ /*
+ * Every trick has failed, so we pause momentarily to let
+ * the filesystem syncer process catch up.
+ */
+ if (islocked == 0)
+ ACQUIRE_LOCK(&lk);
+ if (proc_waiting == 0) {
+ proc_waiting = 1;
+ timeout(pause_timer, NULL, tickdelay > 2 ? tickdelay : 2);
+ }
+ FREE_LOCK_INTERLOCKED(&lk);
+ (void) tsleep((caddr_t)&proc_waiting, PPAUSE | PCATCH, "softupdate", 0);
+ ACQUIRE_LOCK_INTERLOCKED(&lk);
+ if (islocked == 0)
+ FREE_LOCK(&lk);
+ max_limit_hit += 1;
+ return (1);
+}
+
+/*
+ * Awaken processes pausing in checklimit and clear proc_waiting
+ * to indicate that there is no longer a timer running.
+ */
+void
+pause_timer(arg)
+ void *arg;
+{
+
+ proc_waiting = 0;
+ wakeup(&proc_waiting);
+}
+
+/*
* Structure hashing.
*
* There are three types of structures that can be looked up:
@@ -690,7 +786,8 @@ top:
* Structures and routines associated with inodedep caching.
*/
LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl;
-u_long inodedep_hash; /* size of hash table - 1 */
+static u_long inodedep_hash; /* size of hash table - 1 */
+static long num_inodedep; /* number of inodedep allocated */
#define INODEDEP_HASH(fs, inum) \
(&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash])
static struct sema inodedep_in_progress;
@@ -710,11 +807,13 @@ inodedep_lookup(fs, inum, flags, inodedeppp)
{
struct inodedep *inodedep;
struct inodedep_hashhead *inodedephd;
+ int firsttry;
#ifdef DEBUG
if (lk.lkt_held == -1)
panic("inodedep_lookup: lock not held");
#endif
+ firsttry = 1;
inodedephd = INODEDEP_HASH(fs, inum);
top:
for (inodedep = LIST_FIRST(inodedephd); inodedep;
@@ -729,10 +828,15 @@ top:
*inodedeppp = NULL;
return (0);
}
+ if (firsttry && checklimit(&num_inodedep, 1) == 1) {
+ firsttry = 0;
+ goto top;
+ }
if (sema_get(&inodedep_in_progress, &lk) == 0) {
ACQUIRE_LOCK(&lk);
goto top;
}
+ num_inodedep += 1;
MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep),
M_INODEDEP, M_WAITOK);
inodedep->id_list.wk_type = D_INODEDEP;
@@ -745,6 +849,7 @@ top:
inodedep->id_buf = NULL;
LIST_INIT(&inodedep->id_pendinghd);
LIST_INIT(&inodedep->id_inowait);
+ LIST_INIT(&inodedep->id_bufwait);
TAILQ_INIT(&inodedep->id_inoupdt);
TAILQ_INIT(&inodedep->id_newinoupdt);
ACQUIRE_LOCK(&lk);
@@ -815,11 +920,11 @@ softdep_initialize()
LIST_INIT(&mkdirlisthd);
LIST_INIT(&softdep_workitem_pending);
- pagedep_hashtbl = hashinit(desiredvnodes / 10, M_PAGEDEP,
+ max_softdeps = desiredvnodes * 8;
+ pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
&pagedep_hash);
sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0);
- inodedep_hashtbl = hashinit(desiredvnodes / 2, M_INODEDEP,
- &inodedep_hash);
+ inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0);
newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash);
sema_init(&newblk_in_progress, "newblk", PRIBIO, 0);
@@ -1452,6 +1557,7 @@ setup_allocindir_phase2(bp, ip, aip)
* later release and zero the inode so that the calling routine
* can release it.
*/
+static long num_freeblks; /* number of freeblks allocated */
void
softdep_setup_freeblocks(ip, length)
struct inode *ip; /* The inode whose length is to be reduced */
@@ -1468,6 +1574,8 @@ softdep_setup_freeblocks(ip, length)
fs = ip->i_fs;
if (length != 0)
panic("softde_setup_freeblocks: non-zero length");
+ (void) checklimit(&num_freeblks, 0);
+ num_freeblks += 1;
MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks),
M_FREEBLKS, M_WAITOK);
bzero(freeblks, sizeof(struct freeblks));
@@ -1511,7 +1619,7 @@ softdep_setup_freeblocks(ip, length)
* Add the freeblks structure to the list of operations that
* must await the zero'ed inode being written to disk.
*/
- WORKLIST_INSERT(&inodedep->id_inowait, &freeblks->fb_list);
+ WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list);
/*
* Because the file length has been truncated to zero, any
* pending block allocation dependency structures associated
@@ -1630,7 +1738,7 @@ deallocate_dependencies(bp, inodedep)
if (inodedep == NULL)
add_to_worklist(&dirrem->dm_list);
else
- WORKLIST_INSERT(&inodedep->id_inowait,
+ WORKLIST_INSERT(&inodedep->id_bufwait,
&dirrem->dm_list);
}
WORKLIST_REMOVE(&pagedep->pd_list);
@@ -1678,7 +1786,7 @@ free_allocdirect(adphead, adp, delay)
WORKLIST_REMOVE(&adp->ad_list);
if (adp->ad_freefrag != NULL) {
if (delay)
- WORKLIST_INSERT(&adp->ad_inodedep->id_inowait,
+ WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait,
&adp->ad_freefrag->ff_list);
else
add_to_worklist(&adp->ad_freefrag->ff_list);
@@ -1690,6 +1798,7 @@ free_allocdirect(adphead, adp, delay)
* Prepare an inode to be freed. The actual free operation is not
* done until the zero'ed inode has been written to disk.
*/
+static long num_freefile; /* number of freefile allocated */
void
softdep_freefile(pvp, ino, mode)
struct vnode *pvp;
@@ -1703,6 +1812,8 @@ softdep_freefile(pvp, ino, mode)
/*
* This sets up the inode de-allocation dependency.
*/
+ (void) checklimit(&num_freefile, 0);
+ num_freefile += 1;
MALLOC(freefile, struct freefile *, sizeof(struct freefile),
M_FREEFILE, M_WAITOK);
freefile->fx_list.wk_type = D_FREEFILE;
@@ -1761,6 +1872,7 @@ free_inodedep(inodedep)
if ((inodedep->id_state & ONWORKLIST) != 0 ||
(inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
LIST_FIRST(&inodedep->id_pendinghd) != NULL ||
+ LIST_FIRST(&inodedep->id_bufwait) != NULL ||
LIST_FIRST(&inodedep->id_inowait) != NULL ||
TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL ||
@@ -1768,6 +1880,7 @@ free_inodedep(inodedep)
return (0);
LIST_REMOVE(inodedep, id_hash);
WORKITEM_FREE(inodedep, D_INODEDEP);
+ num_inodedep -= 1;
return (1);
}
@@ -1836,6 +1949,7 @@ handle_workitem_freeblocks(freeblks)
softdep_error("handle_workitem_freeblks", allerror);
#endif /* DIAGNOSTIC */
WORKITEM_FREE(freeblks, D_FREEBLKS);
+ num_freeblks -= 1;
}
/*
@@ -1940,7 +2054,7 @@ free_allocindir(aip, inodedep)
if (inodedep == NULL)
add_to_worklist(&freefrag->ff_list);
else
- WORKLIST_INSERT(&inodedep->id_inowait,
+ WORKLIST_INSERT(&inodedep->id_bufwait,
&freefrag->ff_list);
}
WORKITEM_FREE(aip, D_ALLOCINDIR);
@@ -2038,23 +2152,27 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp)
WORKITEM_FREE(mkdir2, D_MKDIR);
} else {
LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs);
- WORKLIST_INSERT(&inodedep->id_inowait,&mkdir2->md_list);
+ WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list);
}
}
/*
- * Link into parent directory pagedep and new inode inodedep
- * structures to await its being written.
+ * Link into parent directory pagedep to await its being written.
*/
if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0)
WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
dap->da_pagedep = pagedep;
LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
da_pdlist);
- if (inodedep_lookup(fs, newinum, DEPALLOC, &inodedep) == 1 &&
- (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
- WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
+ /*
+ * Link into its inodedep. Put it on the id_bufwait list if the inode
+ * is not yet written. If it is written, do the post-inode write
+ * processing to put it on the id_pendinghd list.
+ */
+ (void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep);
+ if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
+ diradd_inode_written(dap, inodedep);
else
- WORKLIST_INSERT(&inodedep->id_inowait, &dap->da_list);
+ WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
FREE_LOCK(&lk);
}
@@ -2314,7 +2432,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
LIST_INSERT_HEAD(
&dirrem->dm_pagedep->pd_diraddhd[DIRADDHASH(offset)],
dap, da_pdlist);
- WORKLIST_INSERT(&inodedep->id_inowait, &dap->da_list);
+ WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
} else if ((dirrem->dm_state & COMPLETE) == 0) {
LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
dm_next);
@@ -2366,12 +2484,8 @@ handle_workitem_remove(dirrem)
*/
if ((dirrem->dm_state & RMDIR) == 0) {
ip->i_nlink--;
- if (ip->i_nlink < ip->i_effnlink) {
-#ifdef DIAGNOSTIC
- vprint("handle_workitem_remove: bad file delta", vp);
-#endif
- ip->i_effnlink = ip->i_nlink;
- }
+ if (ip->i_nlink < ip->i_effnlink)
+ panic("handle_workitem_remove: bad file delta");
ip->i_flag |= IN_CHANGE;
vput(vp);
WORKITEM_FREE(dirrem, D_DIRREM);
@@ -2436,6 +2550,7 @@ handle_workitem_freefile(freefile)
if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0)
softdep_error("handle_workitem_freefile", error);
WORKITEM_FREE(freefile, D_FREEFILE);
+ num_freefile -= 1;
}
/*
@@ -3022,7 +3137,7 @@ handle_written_inodeblock(inodedep, bp)
* before the old ones have been deleted.
*/
filefree = NULL;
- while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
+ while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
WORKLIST_REMOVE(wk);
switch (wk->wk_type) {
@@ -3043,18 +3158,7 @@ handle_written_inodeblock(inodedep, bp)
continue;
case D_DIRADD:
- dap = WK_DIRADD(wk);
- dap->da_state |= COMPLETE;
- if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
- if (dap->da_state & DIRCHG)
- pagedep = dap->da_previous->dm_pagedep;
- else
- pagedep = dap->da_pagedep;
- LIST_REMOVE(dap, da_pdlist);
- LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
- da_pdlist);
- }
- WORKLIST_INSERT(&inodedep->id_pendinghd, wk);
+ diradd_inode_written(WK_DIRADD(wk), inodedep);
continue;
case D_FREEBLKS:
@@ -3069,8 +3173,12 @@ handle_written_inodeblock(inodedep, bp)
/* NOTREACHED */
}
}
- if (filefree != NULL)
+ if (filefree != NULL) {
+ if (free_inodedep(inodedep) == 0)
+ panic("handle_written_inodeblock: live inodedep");
add_to_worklist(filefree);
+ return (0);
+ }
/*
* If no outstanding dependencies, free it.
@@ -3081,6 +3189,29 @@ handle_written_inodeblock(inodedep, bp)
}
/*
+ * Process a diradd entry after its dependent inode has been written.
+ * This routine must be called with splbio interrupts blocked.
+ */
+static void
+diradd_inode_written(dap, inodedep)
+ struct diradd *dap;
+ struct inodedep *inodedep;
+{
+ struct pagedep *pagedep;
+
+ dap->da_state |= COMPLETE;
+ if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
+ if (dap->da_state & DIRCHG)
+ pagedep = dap->da_previous->dm_pagedep;
+ else
+ pagedep = dap->da_pagedep;
+ LIST_REMOVE(dap, da_pdlist);
+ LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
+ }
+ WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
+}
+
+/*
* Handle the completion of a mkdir dependency.
*/
static void
@@ -3229,6 +3360,7 @@ softdep_load_inodeblock(ip)
}
if (inodedep->id_nlinkdelta != 0) {
ip->i_effnlink -= inodedep->id_nlinkdelta;
+ ip->i_flag |= IN_MODIFIED;
inodedep->id_nlinkdelta = 0;
(void) free_inodedep(inodedep);
}
@@ -3252,6 +3384,7 @@ softdep_update_inodeblock(ip, bp, waitfor)
int waitfor; /* 1 => update must be allowed */
{
struct inodedep *inodedep;
+ struct worklist *wk;
int error, gotit;
/*
@@ -3273,15 +3406,6 @@ softdep_update_inodeblock(ip, bp, waitfor)
panic("softdep_update_inodeblock: bad delta");
inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
/*
- * If the last remaining use for the inodedep was to track the
- * link count, and there is no difference between the effective
- * and actual link count, then we can free the inodedep.
- */
- if (free_inodedep(inodedep)) {
- FREE_LOCK(&lk);
- return;
- }
- /*
* Changes have been initiated. Anything depending on these
* changes cannot occur until this inode has been written.
*/
@@ -3298,6 +3422,16 @@ softdep_update_inodeblock(ip, bp, waitfor)
if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL)
handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt));
/*
+ * Now that the inode has been pushed into the buffer, the
+ * operations dependent on the inode being written to disk
+ * can be moved to the id_bufwait so that they will be
+ * processed when the buffer I/O completes.
+ */
+ while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
+ WORKLIST_REMOVE(wk);
+ WORKLIST_INSERT(&inodedep->id_bufwait, wk);
+ }
+ /*
* Newly allocated inodes cannot be written until the bitmap
* that allocates them have been written (indicated by
* DEPCOMPLETE being set in id_state). If we are doing a
@@ -3378,6 +3512,7 @@ softdep_fsync(vp)
if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0)
break;
if (LIST_FIRST(&inodedep->id_inowait) != NULL ||
+ LIST_FIRST(&inodedep->id_bufwait) != NULL ||
TAILQ_FIRST(&inodedep->id_inoupdt) != NULL ||
TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL)
panic("softdep_fsync: pending ops");
@@ -3444,8 +3579,8 @@ softdep_fsync(vp)
*/
error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred,
&bp);
- vput(pvp);
ret = VOP_BWRITE(bp);
+ vput(pvp);
if (error != 0)
return (error);
if (ret != 0)
@@ -3535,8 +3670,13 @@ loop:
if (adp->ad_state & DEPCOMPLETE)
break;
nbp = adp->ad_buf;
- if (getdirtybuf(&nbp, waitfor) == 0)
+ if (getdirtybuf(&nbp, waitfor) == 0) {
+#if 0 /* [JRE] I suspect this should be here XXX */
+ if (waitfor == MNT_NOWAIT)
+ continue;
+#endif
break;
+ }
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(nbp);
@@ -3713,8 +3853,11 @@ flush_inodedep_deps(fs, ino)
if (adp->ad_state & DEPCOMPLETE)
continue;
bp = adp->ad_buf;
- if (getdirtybuf(&bp, waitfor) == 0)
+ if (getdirtybuf(&bp, waitfor) == 0) {
+ if (waitfor == MNT_NOWAIT)
+ continue;
break;
+ }
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(bp);
@@ -3732,8 +3875,11 @@ flush_inodedep_deps(fs, ino)
if (adp->ad_state & DEPCOMPLETE)
continue;
bp = adp->ad_buf;
- if (getdirtybuf(&bp, waitfor) == 0)
+ if (getdirtybuf(&bp, waitfor) == 0) {
+ if (waitfor == MNT_NOWAIT)
+ continue;
break;
+ }
FREE_LOCK(&lk);
if (waitfor == MNT_NOWAIT) {
bawrite(bp);
diff --git a/contrib/sys/softupdates/softdep.h b/contrib/sys/softupdates/softdep.h
index 6b5d416..3435f63 100644
--- a/contrib/sys/softupdates/softdep.h
+++ b/contrib/sys/softupdates/softdep.h
@@ -52,7 +52,7 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * from: @(#)softdep.h 9.4 (McKusick) 1/15/98
+ * @(#)softdep.h 9.5 (McKusick) 2/11/98
*/
#include <sys/queue.h>
@@ -201,31 +201,45 @@ struct pagedep {
/*
* The "inodedep" structure tracks the set of dependencies associated
- * with an inode. Each block that is allocated is represented by an
+ * with an inode. One task that it must manage is delayed operations
+ * (i.e., work requests that must be held until the inodedep's associated
+ * inode has been written to disk). Getting an inode from its incore
+ * state to the disk requires two steps to be taken by the filesystem
+ * in this order: first the inode must be copied to its disk buffer by
+ * the VOP_UPDATE operation; second the inode's buffer must be written
+ * to disk. To ensure that both operations have happened in the required
+ * order, the inodedep maintains two lists. Delayed operations are
+ * placed on the id_inowait list. When the VOP_UPDATE is done, all
+ * operations on the id_inowait list are moved to the id_bufwait list.
+ * When the buffer is written, the items on the id_bufwait list can be
+ * safely moved to the work queue to be processed. A second task of the
+ * inodedep structure is to track the status of block allocation within
+ * the inode. Each block that is allocated is represented by an
* "allocdirect" structure (see below). It is linked onto the id_newinoupdt
* list until both its contents and its allocation in the cylinder
- * group map have been written to disk. Once the dependencies have been
+ * group map have been written to disk. Once these dependencies have been
* satisfied, it is removed from the id_newinoupdt list and any followup
* actions such as releasing the previous block or fragment are placed
- * on the id_inowait list. When an inode is updated (copied from the
- * in-core inode structure to a disk buffer containing its on-disk
- * copy), the "inodedep" structure is linked onto the buffer through
- * its worklist. Thus it will be notified when the buffer is about
+ * on the id_inowait list. When an inode is updated (a VOP_UPDATE is
+ * done), the "inodedep" structure is linked onto the buffer through
+ * its worklist. Thus, it will be notified when the buffer is about
* to be written and when it is done. At the update time, all the
* elements on the id_newinoupdt list are moved to the id_inoupdt list
* since those changes are now relevant to the copy of the inode in the
- * buffer. When the buffer containing the inode is written to disk, any
- * updates listed on the id_inoupdt list are rolled back as they are
- * not yet safe. Following the write, the changes are once again rolled
- * forward and any actions on the id_inowait list are processed (since
- * the previously allocated blocks are no longer claimed on the disk).
+ * buffer. Also at update time, the tasks on the id_inowait list are
+ * moved to the id_bufwait list so that they will be executed when
+ * the updated inode has been written to disk. When the buffer containing
+ * the inode is written to disk, any updates listed on the id_inoupdt
+ * list are rolled back as they are not yet safe. Following the write,
+ * the changes are once again rolled forward and any actions on the
+ * id_bufwait list are processed (since those actions are now safe).
* The entries on the id_inoupdt and id_newinoupdt lists must be kept
* sorted by logical block number to speed the calculation of the size
* of the rolled back inode (see explanation in initiate_write_inodeblock).
* When a directory entry is created, it is represented by a diradd.
- * The diradd is added to the id_inowait list and is not permitted to be
- * written to disk until the inode that it represents is written. After
- * the inode is written, the id_inowait list is processed and the diradd
+ * The diradd is added to the id_inowait list as it cannot be safely
+ * written to disk until the inode that it represents is on disk. After
+ * the inode is written, the id_bufwait list is processed and the diradd
* entries are moved to the id_pendinghd list where they remain until
* the directory block containing the name has been written to disk.
* The purpose of keeping the entries on the id_pendinghd list is so that
@@ -244,7 +258,8 @@ struct inodedep {
struct buf *id_buf; /* related bmsafemap (if pending) */
off_t id_savedsize; /* file size saved during rollback */
struct workhead id_pendinghd; /* entries awaiting directory write */
- struct workhead id_inowait; /* operations after inode written */
+ struct workhead id_bufwait; /* operations after inode written */
+ struct workhead id_inowait; /* operations waiting inode update */
struct allocdirectlst id_inoupdt; /* updates before inode written */
struct allocdirectlst id_newinoupdt; /* updates when inode written */
};
@@ -460,7 +475,7 @@ struct freefile {
* if appropriate and is never cleared.
*/
struct diradd {
- struct worklist da_list; /* id_inowait and id_pendinghd list */
+ struct worklist da_list; /* id_inowait, id_bufwait or id_pendinghd list */
# define da_state da_list.wk_state /* state of the new directory entry */
LIST_ENTRY(diradd) da_pdlist; /* pagedep holding directory block */
doff_t da_offset; /* offset of new dir entry in dir blk */
OpenPOWER on IntegriCloud