diff options
author | julian <julian@FreeBSD.org> | 1998-05-19 21:45:53 +0000 |
---|---|---|
committer | julian <julian@FreeBSD.org> | 1998-05-19 21:45:53 +0000 |
commit | 0cc808ba0d80f670fdbfdad8596b352a02661706 (patch) | |
tree | ec3a27afcc6ad001da6ddc8d4a48b41358f52290 /contrib/sys | |
parent | 9feb27237255a180d341b3412632902b6444e5db (diff) | |
download | FreeBSD-src-0cc808ba0d80f670fdbfdad8596b352a02661706.zip FreeBSD-src-0cc808ba0d80f670fdbfdad8596b352a02661706.tar.gz |
Merge in Kirk's changes to stop softupdates from hogging all of memory.
Diffstat (limited to 'contrib/sys')
-rw-r--r-- | contrib/sys/softupdates/README | 66 | ||||
-rw-r--r-- | contrib/sys/softupdates/ffs_softdep.c | 246 | ||||
-rw-r--r-- | contrib/sys/softupdates/softdep.h | 49 |
3 files changed, 294 insertions, 67 deletions
diff --git a/contrib/sys/softupdates/README b/contrib/sys/softupdates/README index d4676c9..097eeca 100644 --- a/contrib/sys/softupdates/README +++ b/contrib/sys/softupdates/README @@ -249,3 +249,69 @@ code and installed the updated utilities, do the following: it and run `./doit'. You may want to check out each of the three subtests individually first: doit1 - andrew benchmarks, doit2 - copy and removal of /etc, doit3 - find from /. + +==== +Additional notes from Feb 13 + +When removing huge directories of files, it is possible to get +the incore state arbitrarily far ahead of the disk. Maintaining +all the associated dependency information can exhaust the kernel +malloc arena. To avoid this scenario, I have put some limits on +the soft update code so that it will not be allowed to rampage +through all of the kernel memory. I enclose below the relevant +patches to vnode.h and vfs_subr.c (which allow the soft update +code to speed up the filesystem syncer process). I have also +included the diffs for ffs_softdep.c. I hope to make a pass over +ffs_softdep.c to isolate the differences with my standard version +so that these diffs are less painful to incorporate. + +Since I know you like to play with tuning, I have put the relevant +knobs on sysctl debug variables. The tuning knobs can be viewed +with `sysctl debug' and set with `sysctl -w debug.<name>=value'. +The knobs are as follows: + + debug.max_softdeps - limit on any given resource + debug.tickdelay - ticks to delay before allocating + debug.max_limit_hit - number of times tickdelay imposed + debug.rush_requests - number of rush requests to filesystem syncer + +The max_softdeps limit is derived from desiredvnodes which in +turn is sized based on the amount of memory on the machine. +When the limit is hit, a process requesting a resource first +tries to speed up the filesystem syncer process. Such a +request is recorded as a rush_request. 
After syncdelay / 2 +unserviced rush requests (typically 15) are in the filesystem +syncer's queue (i.e., it is more than 15 seconds behind in its +work), the process requesting the memory is put to sleep for +tickdelay ticks. Such a delay is recorded in max_limit_hit. +Following this delay it is granted its memory without further +delay. I have tried the following experiments in which I +delete an MH directory containing 16,703 files: + +Run # 1 2 3 + +max_softdeps 4496 4496 4496 +tickdelay 100 == 1 sec 20 == 0.2 sec 2 == 0.02 sec +max_limit_hit 16 == 16 sec 27 == 5.4 sec 203 == 4.1 sec +rush_requests 147 102 93 +run time 57 sec 46 sec 45 sec +I/O's 781 859 936 + +When run with no limits, it completes in 40 seconds. So, the +time spent in delay is directly added to the bottom line. +Shortening the tick delay does cut down the total running time, +but at the expense of generating more total I/O operations +due to the rush orders being sent to the filesystem syncer. +Although the number of rush orders decreases with a shorter +tick delay, there are more requests in each order, hence the +increase in I/O count. Also, although the I/O count does rise +with a shorter delay, it is still at least an order of magnitude +less than without soft updates. Anyway, you may want to play +around with these values to see what works best and to see if +you can get an insight into how best to tune them. If you get +out-of-memory panics, then you have max_softdeps set too high. +The max_limit_hit and rush_requests should be reset to zero +before each run. The minimum legal value for tickdelay is 2 +(if you set it below that, the code will use 2). + + diff --git a/contrib/sys/softupdates/ffs_softdep.c b/contrib/sys/softupdates/ffs_softdep.c index 118689c..b86bb2a 100644 --- a/contrib/sys/softupdates/ffs_softdep.c +++ b/contrib/sys/softupdates/ffs_softdep.c @@ -53,7 +53,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)ffs_softdep.c 9.14 (McKusick) 1/15/98 + * @(#)ffs_softdep.c 9.21 (McKusick) 2/15/98 */ /* @@ -95,6 +95,7 @@ static int flush_pagedep_deps __P((struct vnode *, struct mount *, struct diraddhd *)); static int flush_inodedep_deps __P((struct fs *, ino_t)); static int handle_written_filepage __P((struct pagedep *, struct buf *)); +static void diradd_inode_written __P((struct diradd *, struct inodedep *)); static int handle_written_inodeblock __P((struct inodedep *, struct buf *)); static void handle_allocdirect_partdone __P((struct allocdirect *)); static void handle_allocindir_partdone __P((struct allocindir *)); @@ -129,6 +130,8 @@ static int newblk_lookup __P((struct fs *, ufs_daddr_t, int, static int inodedep_lookup __P((struct fs *, ino_t, int, struct inodedep **)); static int pagedep_lookup __P((struct inode *, ufs_lbn_t, int, struct pagedep **)); +static void pause_timer __P((void *)); +static int checklimit __P((long *, int)); static void add_to_worklist __P((struct worklist *)); /* @@ -427,6 +430,28 @@ workitem_free(item, type) */ static struct workhead softdep_workitem_pending; static int softdep_worklist_busy; +static int max_softdeps; /* maximum number of structs before slowdown */ +static int tickdelay = 2; /* number of ticks to pause during slowdown */ +static int max_limit_hit; /* number of times slowdown imposed */ +static int rush_requests; /* number of times I/O speeded up */ +static int proc_waiting; /* tracks whether we have a timeout posted */ +static pid_t filesys_syncer_pid;/* records pid of filesystem syncer process */ +#ifdef DEBUG +#include <vm/vm.h> +#include <sys/sysctl.h> +#if defined(__FreeBSD__) +SYSCTL_INT(_debug, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, max_limit_hit, CTLFLAG_RW, &max_limit_hit, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &rush_requests, 0, ""); +#else /* !__FreeBSD__ */ 
+struct ctldebug debug4 = { "max_softdeps", &max_softdeps }; +struct ctldebug debug5 = { "tickdelay", &tickdelay }; +struct ctldebug debug6 = { "max_limit_hit", &max_limit_hit }; +struct ctldebug debug7 = { "rush_requests", &rush_requests }; +#endif /* !__FreeBSD__ */ + +#endif /* DEBUG */ /* * Add an item to the end of the work queue. @@ -465,10 +490,16 @@ int softdep_process_worklist(matchmnt) struct mount *matchmnt; { + struct proc *p = curproc; struct worklist *wk; struct fs *matchfs; int matchcnt; + /* + * Record the process identifier of our caller so that we can + * give this process preferential treatment in checklimit below. + */ + filesys_syncer_pid = p->p_pid; matchcnt = 0; matchfs = NULL; if (matchmnt != NULL) @@ -592,6 +623,71 @@ softdep_flushfiles(oldmnt, flags, p) } /* + * A large burst of file addition or deletion activity can drive the + * memory load excessively high. Therefore we deliberately slow things + * down and speed up the I/O processing if we find ourselves with too + * many dependencies in progress. + */ +static int +checklimit(resource, islocked) + long *resource; + int islocked; +{ + struct proc *p = curproc; + + /* + * If we are under our limit, just proceed. + */ + if (*resource < max_softdeps) + return (0); + /* + * We never hold up the filesystem syncer process. + */ + if (p->p_pid == filesys_syncer_pid) + return (0); + /* + * Our first approach is to speed up the syncer process. + * We never push it to speed up more than half of its + * normal turn time, otherwise it could take over the cpu. + */ + if (rushjob < syncdelay / 2) { + rushjob += 1; + rush_requests += 1; + return (0); + } + /* + * Every trick has failed, so we pause momentarily to let + * the filesystem syncer process catch up. + */ + if (islocked == 0) + ACQUIRE_LOCK(&lk); + if (proc_waiting == 0) { + proc_waiting = 1; + timeout(pause_timer, NULL, tickdelay > 2 ? 
tickdelay : 2); + } + FREE_LOCK_INTERLOCKED(&lk); + (void) tsleep((caddr_t)&proc_waiting, PPAUSE | PCATCH, "softupdate", 0); + ACQUIRE_LOCK_INTERLOCKED(&lk); + if (islocked == 0) + FREE_LOCK(&lk); + max_limit_hit += 1; + return (1); +} + +/* + * Awaken processes pausing in checklimit and clear proc_waiting + * to indicate that there is no longer a timer running. + */ +void +pause_timer(arg) + void *arg; +{ + + proc_waiting = 0; + wakeup(&proc_waiting); +} + +/* * Structure hashing. * * There are three types of structures that can be looked up: @@ -690,7 +786,8 @@ top: * Structures and routines associated with inodedep caching. */ LIST_HEAD(inodedep_hashhead, inodedep) *inodedep_hashtbl; -u_long inodedep_hash; /* size of hash table - 1 */ +static u_long inodedep_hash; /* size of hash table - 1 */ +static long num_inodedep; /* number of inodedep allocated */ #define INODEDEP_HASH(fs, inum) \ (&inodedep_hashtbl[((((register_t)(fs)) >> 13) + (inum)) & inodedep_hash]) static struct sema inodedep_in_progress; @@ -710,11 +807,13 @@ inodedep_lookup(fs, inum, flags, inodedeppp) { struct inodedep *inodedep; struct inodedep_hashhead *inodedephd; + int firsttry; #ifdef DEBUG if (lk.lkt_held == -1) panic("inodedep_lookup: lock not held"); #endif + firsttry = 1; inodedephd = INODEDEP_HASH(fs, inum); top: for (inodedep = LIST_FIRST(inodedephd); inodedep; @@ -729,10 +828,15 @@ top: *inodedeppp = NULL; return (0); } + if (firsttry && checklimit(&num_inodedep, 1) == 1) { + firsttry = 0; + goto top; + } if (sema_get(&inodedep_in_progress, &lk) == 0) { ACQUIRE_LOCK(&lk); goto top; } + num_inodedep += 1; MALLOC(inodedep, struct inodedep *, sizeof(struct inodedep), M_INODEDEP, M_WAITOK); inodedep->id_list.wk_type = D_INODEDEP; @@ -745,6 +849,7 @@ top: inodedep->id_buf = NULL; LIST_INIT(&inodedep->id_pendinghd); LIST_INIT(&inodedep->id_inowait); + LIST_INIT(&inodedep->id_bufwait); TAILQ_INIT(&inodedep->id_inoupdt); TAILQ_INIT(&inodedep->id_newinoupdt); ACQUIRE_LOCK(&lk); @@ -815,11 
+920,11 @@ softdep_initialize() LIST_INIT(&mkdirlisthd); LIST_INIT(&softdep_workitem_pending); - pagedep_hashtbl = hashinit(desiredvnodes / 10, M_PAGEDEP, + max_softdeps = desiredvnodes * 8; + pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &pagedep_hash); sema_init(&pagedep_in_progress, "pagedep", PRIBIO, 0); - inodedep_hashtbl = hashinit(desiredvnodes / 2, M_INODEDEP, - &inodedep_hash); + inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash); sema_init(&inodedep_in_progress, "inodedep", PRIBIO, 0); newblk_hashtbl = hashinit(64, M_NEWBLK, &newblk_hash); sema_init(&newblk_in_progress, "newblk", PRIBIO, 0); @@ -1452,6 +1557,7 @@ setup_allocindir_phase2(bp, ip, aip) * later release and zero the inode so that the calling routine * can release it. */ +static long num_freeblks; /* number of freeblks allocated */ void softdep_setup_freeblocks(ip, length) struct inode *ip; /* The inode whose length is to be reduced */ @@ -1468,6 +1574,8 @@ softdep_setup_freeblocks(ip, length) fs = ip->i_fs; if (length != 0) panic("softde_setup_freeblocks: non-zero length"); + (void) checklimit(&num_freeblks, 0); + num_freeblks += 1; MALLOC(freeblks, struct freeblks *, sizeof(struct freeblks), M_FREEBLKS, M_WAITOK); bzero(freeblks, sizeof(struct freeblks)); @@ -1511,7 +1619,7 @@ softdep_setup_freeblocks(ip, length) * Add the freeblks structure to the list of operations that * must await the zero'ed inode being written to disk. 
*/ - WORKLIST_INSERT(&inodedep->id_inowait, &freeblks->fb_list); + WORKLIST_INSERT(&inodedep->id_bufwait, &freeblks->fb_list); /* * Because the file length has been truncated to zero, any * pending block allocation dependency structures associated @@ -1630,7 +1738,7 @@ deallocate_dependencies(bp, inodedep) if (inodedep == NULL) add_to_worklist(&dirrem->dm_list); else - WORKLIST_INSERT(&inodedep->id_inowait, + WORKLIST_INSERT(&inodedep->id_bufwait, &dirrem->dm_list); } WORKLIST_REMOVE(&pagedep->pd_list); @@ -1678,7 +1786,7 @@ free_allocdirect(adphead, adp, delay) WORKLIST_REMOVE(&adp->ad_list); if (adp->ad_freefrag != NULL) { if (delay) - WORKLIST_INSERT(&adp->ad_inodedep->id_inowait, + WORKLIST_INSERT(&adp->ad_inodedep->id_bufwait, &adp->ad_freefrag->ff_list); else add_to_worklist(&adp->ad_freefrag->ff_list); @@ -1690,6 +1798,7 @@ free_allocdirect(adphead, adp, delay) * Prepare an inode to be freed. The actual free operation is not * done until the zero'ed inode has been written to disk. */ +static long num_freefile; /* number of freefile allocated */ void softdep_freefile(pvp, ino, mode) struct vnode *pvp; @@ -1703,6 +1812,8 @@ softdep_freefile(pvp, ino, mode) /* * This sets up the inode de-allocation dependency. 
*/ + (void) checklimit(&num_freefile, 0); + num_freefile += 1; MALLOC(freefile, struct freefile *, sizeof(struct freefile), M_FREEFILE, M_WAITOK); freefile->fx_list.wk_type = D_FREEFILE; @@ -1761,6 +1872,7 @@ free_inodedep(inodedep) if ((inodedep->id_state & ONWORKLIST) != 0 || (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || LIST_FIRST(&inodedep->id_pendinghd) != NULL || + LIST_FIRST(&inodedep->id_bufwait) != NULL || LIST_FIRST(&inodedep->id_inowait) != NULL || TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL || @@ -1768,6 +1880,7 @@ free_inodedep(inodedep) return (0); LIST_REMOVE(inodedep, id_hash); WORKITEM_FREE(inodedep, D_INODEDEP); + num_inodedep -= 1; return (1); } @@ -1836,6 +1949,7 @@ handle_workitem_freeblocks(freeblks) softdep_error("handle_workitem_freeblks", allerror); #endif /* DIAGNOSTIC */ WORKITEM_FREE(freeblks, D_FREEBLKS); + num_freeblks -= 1; } /* @@ -1940,7 +2054,7 @@ free_allocindir(aip, inodedep) if (inodedep == NULL) add_to_worklist(&freefrag->ff_list); else - WORKLIST_INSERT(&inodedep->id_inowait, + WORKLIST_INSERT(&inodedep->id_bufwait, &freefrag->ff_list); } WORKITEM_FREE(aip, D_ALLOCINDIR); @@ -2038,23 +2152,27 @@ softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp) WORKITEM_FREE(mkdir2, D_MKDIR); } else { LIST_INSERT_HEAD(&mkdirlisthd, mkdir2, md_mkdirs); - WORKLIST_INSERT(&inodedep->id_inowait,&mkdir2->md_list); + WORKLIST_INSERT(&inodedep->id_bufwait,&mkdir2->md_list); } } /* - * Link into parent directory pagedep and new inode inodedep - * structures to await its being written. + * Link into parent directory pagedep to await its being written. 
*/ if (pagedep_lookup(dp, lbn, DEPALLOC, &pagedep) == 0) WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); dap->da_pagedep = pagedep; LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); - if (inodedep_lookup(fs, newinum, DEPALLOC, &inodedep) == 1 && - (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) - WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); + /* + * Link into its inodedep. Put it on the id_bufwait list if the inode + * is not yet written. If it is written, do the post-inode write + * processing to put it on the id_pendinghd list. + */ + (void) inodedep_lookup(fs, newinum, DEPALLOC, &inodedep); + if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) + diradd_inode_written(dap, inodedep); else - WORKLIST_INSERT(&inodedep->id_inowait, &dap->da_list); + WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); FREE_LOCK(&lk); } @@ -2314,7 +2432,7 @@ softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) LIST_INSERT_HEAD( &dirrem->dm_pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); - WORKLIST_INSERT(&inodedep->id_inowait, &dap->da_list); + WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); } else if ((dirrem->dm_state & COMPLETE) == 0) { LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, dm_next); @@ -2366,12 +2484,8 @@ handle_workitem_remove(dirrem) */ if ((dirrem->dm_state & RMDIR) == 0) { ip->i_nlink--; - if (ip->i_nlink < ip->i_effnlink) { -#ifdef DIAGNOSTIC - vprint("handle_workitem_remove: bad file delta", vp); -#endif - ip->i_effnlink = ip->i_nlink; - } + if (ip->i_nlink < ip->i_effnlink) + panic("handle_workitem_remove: bad file delta"); ip->i_flag |= IN_CHANGE; vput(vp); WORKITEM_FREE(dirrem, D_DIRREM); @@ -2436,6 +2550,7 @@ handle_workitem_freefile(freefile) if ((error = ffs_freefile(&vp, freefile->fx_oldinum, freefile->fx_mode)) != 0) softdep_error("handle_workitem_freefile", error); WORKITEM_FREE(freefile, D_FREEFILE); + num_freefile -= 1; } /* @@ -3022,7 +3137,7 @@ 
handle_written_inodeblock(inodedep, bp) * before the old ones have been deleted. */ filefree = NULL; - while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { + while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { WORKLIST_REMOVE(wk); switch (wk->wk_type) { @@ -3043,18 +3158,7 @@ handle_written_inodeblock(inodedep, bp) continue; case D_DIRADD: - dap = WK_DIRADD(wk); - dap->da_state |= COMPLETE; - if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { - if (dap->da_state & DIRCHG) - pagedep = dap->da_previous->dm_pagedep; - else - pagedep = dap->da_pagedep; - LIST_REMOVE(dap, da_pdlist); - LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, - da_pdlist); - } - WORKLIST_INSERT(&inodedep->id_pendinghd, wk); + diradd_inode_written(WK_DIRADD(wk), inodedep); continue; case D_FREEBLKS: @@ -3069,8 +3173,12 @@ handle_written_inodeblock(inodedep, bp) /* NOTREACHED */ } } - if (filefree != NULL) + if (filefree != NULL) { + if (free_inodedep(inodedep) == 0) + panic("handle_written_inodeblock: live inodedep"); add_to_worklist(filefree); + return (0); + } /* * If no outstanding dependencies, free it. @@ -3081,6 +3189,29 @@ handle_written_inodeblock(inodedep, bp) } /* + * Process a diradd entry after its dependent inode has been written. + * This routine must be called with splbio interrupts blocked. + */ +static void +diradd_inode_written(dap, inodedep) + struct diradd *dap; + struct inodedep *inodedep; +{ + struct pagedep *pagedep; + + dap->da_state |= COMPLETE; + if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { + if (dap->da_state & DIRCHG) + pagedep = dap->da_previous->dm_pagedep; + else + pagedep = dap->da_pagedep; + LIST_REMOVE(dap, da_pdlist); + LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); + } + WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); +} + +/* * Handle the completion of a mkdir dependency. 
*/ static void @@ -3229,6 +3360,7 @@ softdep_load_inodeblock(ip) } if (inodedep->id_nlinkdelta != 0) { ip->i_effnlink -= inodedep->id_nlinkdelta; + ip->i_flag |= IN_MODIFIED; inodedep->id_nlinkdelta = 0; (void) free_inodedep(inodedep); } @@ -3252,6 +3384,7 @@ softdep_update_inodeblock(ip, bp, waitfor) int waitfor; /* 1 => update must be allowed */ { struct inodedep *inodedep; + struct worklist *wk; int error, gotit; /* @@ -3273,15 +3406,6 @@ softdep_update_inodeblock(ip, bp, waitfor) panic("softdep_update_inodeblock: bad delta"); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; /* - * If the last remaining use for the inodedep was to track the - * link count, and there is no difference between the effective - * and actual link count, then we can free the inodedep. - */ - if (free_inodedep(inodedep)) { - FREE_LOCK(&lk); - return; - } - /* * Changes have been initiated. Anything depending on these * changes cannot occur until this inode has been written. */ @@ -3298,6 +3422,16 @@ softdep_update_inodeblock(ip, bp, waitfor) if (TAILQ_FIRST(&inodedep->id_inoupdt) != NULL) handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt)); /* + * Now that the inode has been pushed into the buffer, the + * operations dependent on the inode being written to disk + * can be moved to the id_bufwait so that they will be + * processed when the buffer I/O completes. + */ + while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { + WORKLIST_REMOVE(wk); + WORKLIST_INSERT(&inodedep->id_bufwait, wk); + } + /* * Newly allocated inodes cannot be written until the bitmap * that allocates them have been written (indicated by * DEPCOMPLETE being set in id_state). 
If we are doing a @@ -3378,6 +3512,7 @@ softdep_fsync(vp) if (inodedep_lookup(fs, ip->i_number, 0, &inodedep) == 0) break; if (LIST_FIRST(&inodedep->id_inowait) != NULL || + LIST_FIRST(&inodedep->id_bufwait) != NULL || TAILQ_FIRST(&inodedep->id_inoupdt) != NULL || TAILQ_FIRST(&inodedep->id_newinoupdt) != NULL) panic("softdep_fsync: pending ops"); @@ -3444,8 +3579,8 @@ softdep_fsync(vp) */ error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), p->p_ucred, &bp); - vput(pvp); ret = VOP_BWRITE(bp); + vput(pvp); if (error != 0) return (error); if (ret != 0) @@ -3535,8 +3670,13 @@ loop: if (adp->ad_state & DEPCOMPLETE) break; nbp = adp->ad_buf; - if (getdirtybuf(&nbp, waitfor) == 0) + if (getdirtybuf(&nbp, waitfor) == 0) { +#if 0 /* [JRE] I suspect this should be here XXX */ + if (waitfor == MNT_NOWAIT) + continue; +#endif break; + } FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(nbp); @@ -3713,8 +3853,11 @@ flush_inodedep_deps(fs, ino) if (adp->ad_state & DEPCOMPLETE) continue; bp = adp->ad_buf; - if (getdirtybuf(&bp, waitfor) == 0) + if (getdirtybuf(&bp, waitfor) == 0) { + if (waitfor == MNT_NOWAIT) + continue; break; + } FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(bp); @@ -3732,8 +3875,11 @@ flush_inodedep_deps(fs, ino) if (adp->ad_state & DEPCOMPLETE) continue; bp = adp->ad_buf; - if (getdirtybuf(&bp, waitfor) == 0) + if (getdirtybuf(&bp, waitfor) == 0) { + if (waitfor == MNT_NOWAIT) + continue; break; + } FREE_LOCK(&lk); if (waitfor == MNT_NOWAIT) { bawrite(bp); diff --git a/contrib/sys/softupdates/softdep.h b/contrib/sys/softupdates/softdep.h index 6b5d416..3435f63 100644 --- a/contrib/sys/softupdates/softdep.h +++ b/contrib/sys/softupdates/softdep.h @@ -52,7 +52,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * from: @(#)softdep.h 9.4 (McKusick) 1/15/98 + * @(#)softdep.h 9.5 (McKusick) 2/11/98 */ #include <sys/queue.h> @@ -201,31 +201,45 @@ struct pagedep { /* * The "inodedep" structure tracks the set of dependencies associated - * with an inode. Each block that is allocated is represented by an + * with an inode. One task that it must manage is delayed operations + * (i.e., work requests that must be held until the inodedep's associated + * inode has been written to disk). Getting an inode from its incore + * state to the disk requires two steps to be taken by the filesystem + * in this order: first the inode must be copied to its disk buffer by + * the VOP_UPDATE operation; second the inode's buffer must be written + * to disk. To ensure that both operations have happened in the required + * order, the inodedep maintains two lists. Delayed operations are + * placed on the id_inowait list. When the VOP_UPDATE is done, all + * operations on the id_inowait list are moved to the id_bufwait list. + * When the buffer is written, the items on the id_bufwait list can be + * safely moved to the work queue to be processed. A second task of the + * inodedep structure is to track the status of block allocation within + * the inode. Each block that is allocated is represented by an * "allocdirect" structure (see below). It is linked onto the id_newinoupdt * list until both its contents and its allocation in the cylinder - * group map have been written to disk. Once the dependencies have been + * group map have been written to disk. Once these dependencies have been * satisfied, it is removed from the id_newinoupdt list and any followup * actions such as releasing the previous block or fragment are placed - * on the id_inowait list. When an inode is updated (copied from the - * in-core inode structure to a disk buffer containing its on-disk - * copy), the "inodedep" structure is linked onto the buffer through - * its worklist. 
Thus it will be notified when the buffer is about + * on the id_inowait list. When an inode is updated (a VOP_UPDATE is + * done), the "inodedep" structure is linked onto the buffer through + * its worklist. Thus, it will be notified when the buffer is about * to be written and when it is done. At the update time, all the * elements on the id_newinoupdt list are moved to the id_inoupdt list * since those changes are now relevant to the copy of the inode in the - * buffer. When the buffer containing the inode is written to disk, any - * updates listed on the id_inoupdt list are rolled back as they are - * not yet safe. Following the write, the changes are once again rolled - * forward and any actions on the id_inowait list are processed (since - * the previously allocated blocks are no longer claimed on the disk). + * buffer. Also at update time, the tasks on the id_inowait list are + * moved to the id_bufwait list so that they will be executed when + * the updated inode has been written to disk. When the buffer containing + * the inode is written to disk, any updates listed on the id_inoupdt + * list are rolled back as they are not yet safe. Following the write, + * the changes are once again rolled forward and any actions on the + * id_bufwait list are processed (since those actions are now safe). * The entries on the id_inoupdt and id_newinoupdt lists must be kept * sorted by logical block number to speed the calculation of the size * of the rolled back inode (see explanation in initiate_write_inodeblock). * When a directory entry is created, it is represented by a diradd. - * The diradd is added to the id_inowait list and is not permitted to be - * written to disk until the inode that it represents is written. After - * the inode is written, the id_inowait list is processed and the diradd + * The diradd is added to the id_inowait list as it cannot be safely + * written to disk until the inode that it represents is on disk. 
After + * the inode is written, the id_bufwait list is processed and the diradd * entries are moved to the id_pendinghd list where they remain until * the directory block containing the name has been written to disk. * The purpose of keeping the entries on the id_pendinghd list is so that @@ -244,7 +258,8 @@ struct inodedep { struct buf *id_buf; /* related bmsafemap (if pending) */ off_t id_savedsize; /* file size saved during rollback */ struct workhead id_pendinghd; /* entries awaiting directory write */ - struct workhead id_inowait; /* operations after inode written */ + struct workhead id_bufwait; /* operations after inode written */ + struct workhead id_inowait; /* operations waiting inode update */ struct allocdirectlst id_inoupdt; /* updates before inode written */ struct allocdirectlst id_newinoupdt; /* updates when inode written */ }; @@ -460,7 +475,7 @@ struct freefile { * if appropriate and is never cleared. */ struct diradd { - struct worklist da_list; /* id_inowait and id_pendinghd list */ + struct worklist da_list; /* id_inowait or id_pendinghd list */ # define da_state da_list.wk_state /* state of the new directory entry */ LIST_ENTRY(diradd) da_pdlist; /* pagedep holding directory block */ doff_t da_offset; /* offset of new dir entry in dir blk */ |