summaryrefslogtreecommitdiffstats
path: root/sys/ufs/ffs
diff options
context:
space:
mode:
Diffstat (limited to 'sys/ufs/ffs')
-rw-r--r--sys/ufs/ffs/ffs_softdep.c268
-rw-r--r--sys/ufs/ffs/softdep.h4
2 files changed, 215 insertions, 57 deletions
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index 4a0dbb1..8ef67df 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -753,8 +753,7 @@ static void handle_written_jnewblk(struct jnewblk *);
static void handle_written_jfreeblk(struct jfreeblk *);
static void handle_written_jfreefrag(struct jfreefrag *);
static void complete_jseg(struct jseg *);
-static void jseg_write(struct ufsmount *ump, struct jblocks *, struct jseg *,
- uint8_t *);
+static void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
static void jremref_write(struct jremref *, struct jseg *, uint8_t *);
static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
@@ -769,6 +768,7 @@ static void handle_allocdirect_partdone(struct allocdirect *,
static struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
struct workhead *);
static void indirdep_complete(struct indirdep *);
+static int indirblk_inseg(struct mount *, ufs2_daddr_t);
static void handle_allocindir_partdone(struct allocindir *);
static void initiate_write_filepage(struct pagedep *, struct buf *);
static void initiate_write_indirdep(struct indirdep*, struct buf *);
@@ -802,7 +802,9 @@ static void free_newdirblk(struct newdirblk *);
static void free_jremref(struct jremref *);
static void free_jaddref(struct jaddref *);
static void free_jsegdep(struct jsegdep *);
-static void free_jseg(struct jseg *);
+static void free_jsegs(struct jblocks *);
+static void rele_jseg(struct jseg *);
+static void free_jseg(struct jseg *, struct jblocks *);
static void free_jnewblk(struct jnewblk *);
static void free_jfreeblk(struct jfreeblk *);
static void free_jfreefrag(struct jfreefrag *);
@@ -872,7 +874,7 @@ static int journal_unsuspend(struct ufsmount *ump);
static void softdep_prelink(struct vnode *, struct vnode *);
static void add_to_journal(struct worklist *);
static void remove_from_journal(struct worklist *);
-static void softdep_process_journal(struct mount *, int);
+static void softdep_process_journal(struct mount *, struct worklist *, int);
static struct jremref *newjremref(struct dirrem *, struct inode *,
struct inode *ip, off_t, nlink_t);
static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
@@ -1376,7 +1378,7 @@ softdep_process_worklist(mp, full)
ump = VFSTOUFS(mp);
ACQUIRE_LOCK(&lk);
starttime = time_second;
- softdep_process_journal(mp, full?MNT_WAIT:0);
+ softdep_process_journal(mp, NULL, full?MNT_WAIT:0);
while (ump->softdep_on_worklist > 0) {
if ((cnt = process_worklist_item(mp, LK_NOWAIT)) == -1)
break;
@@ -1999,6 +2001,37 @@ newblk_lookup(mp, newblkno, flags, newblkpp)
}
/*
+ * Structures and routines associated with indir caching.
+ */
+struct workhead *indir_hashtbl;
+u_long indir_hash; /* size of hash table - 1 */
+#define INDIR_HASH(mp, blkno) \
+ (&indir_hashtbl[((((register_t)(mp)) >> 13) + (blkno)) & indir_hash])
+
+static int
+indirblk_inseg(mp, blkno)
+ struct mount *mp;
+ ufs2_daddr_t blkno;
+{
+ struct freework *freework;
+ struct workhead *wkhd;
+ struct worklist *wk;
+
+ wkhd = INDIR_HASH(mp, blkno);
+ LIST_FOREACH(wk, wkhd, wk_list) {
+ freework = WK_FREEWORK(wk);
+ if (freework->fw_blkno == blkno &&
+ freework->fw_list.wk_mp == mp) {
+ LIST_REMOVE(freework, fw_next);
+ WORKLIST_REMOVE(&freework->fw_list);
+ WORKITEM_FREE(freework, D_FREEWORK);
+ return (1);
+ }
+ }
+ return (0);
+}
+
+/*
* Executed during filesystem system initialization before
* mounting any filesystems.
*/
@@ -2012,6 +2045,7 @@ softdep_initialize()
inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &inodedep_hash);
newblk_hashtbl = hashinit(desiredvnodes / 5, M_NEWBLK, &newblk_hash);
bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &bmsafemap_hash);
+ indir_hashtbl = hashinit(desiredvnodes / 10, M_FREEWORK, &indir_hash);
/* initialise bioops hack */
bioops.io_start = softdep_disk_io_initiation;
@@ -2120,9 +2154,12 @@ softdep_unmount(mp)
struct jblocks {
struct jseglst jb_segs; /* TAILQ of current segments. */
struct jseg *jb_writeseg; /* Next write to complete. */
+ struct jseg *jb_oldestseg; /* Oldest segment with valid entries. */
struct jextent *jb_extent; /* Extent array. */
uint64_t jb_nextseq; /* Next sequence number. */
- uint64_t jb_oldestseq; /* Oldest active sequence number. */
+ uint64_t jb_oldestwrseq; /* Oldest written sequence number. */
+ uint8_t jb_needseg; /* Need a forced segment. */
+ uint8_t jb_suspended; /* Did journal suspend writes? */
int jb_avail; /* Available extents. */
int jb_used; /* Last used extent. */
int jb_head; /* Allocator head. */
@@ -2132,7 +2169,6 @@ struct jblocks {
int jb_min; /* Minimum free space. */
int jb_low; /* Low on space. */
int jb_age; /* Insertion time of oldest rec. */
- int jb_suspended; /* Did journal suspend writes? */
};
struct jextent {
@@ -2575,9 +2611,8 @@ softdep_prelink(dvp, vp)
}
static void
-jseg_write(ump, jblocks, jseg, data)
+jseg_write(ump, jseg, data)
struct ufsmount *ump;
- struct jblocks *jblocks;
struct jseg *jseg;
uint8_t *data;
{
@@ -2585,7 +2620,7 @@ jseg_write(ump, jblocks, jseg, data)
rec = (struct jsegrec *)data;
rec->jsr_seq = jseg->js_seq;
- rec->jsr_oldest = jblocks->jb_oldestseq;
+ rec->jsr_oldest = jseg->js_oldseq;
rec->jsr_cnt = jseg->js_cnt;
rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
rec->jsr_crc = 0;
@@ -2722,8 +2757,9 @@ jtrunc_write(jtrunc, jseg, data)
* Flush some journal records to disk.
*/
static void
-softdep_process_journal(mp, flags)
+softdep_process_journal(mp, needwk, flags)
struct mount *mp;
+ struct worklist *needwk;
int flags;
{
struct jblocks *jblocks;
@@ -2755,17 +2791,23 @@ softdep_process_journal(mp, flags)
jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
segwritten = 0;
- while ((cnt = ump->softdep_on_journal) != 0) {
+ for (;;) {
+ cnt = ump->softdep_on_journal;
/*
- * Create a new segment to hold as many as 'cnt' journal
- * entries and add them to the segment. Notice cnt is
- * off by one to account for the space required by the
- * jsegrec. If we don't have a full block to log skip it
- * unless we haven't written anything.
+ * Criteria for writing a segment:
+ * 1) We have a full block.
+ * 2) We're called from jwait() and haven't found the
+ * journal item yet.
+ * 3) Always write if needseg is set.
+ * 4) If we are called from process_worklist and have
+ * not yet written anything we write a partial block
+ * to enforce a 1 second maximum latency on journal
+ * entries.
*/
- cnt++;
- if (cnt < jrecmax && segwritten)
+ if (cnt < (jrecmax - 1) && needwk == NULL &&
+ jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
break;
+ cnt++;
/*
* Verify some free journal space. softdep_prealloc() should
* guarantee that we don't run out so this is indicative of
@@ -2783,6 +2825,7 @@ softdep_process_journal(mp, flags)
jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
workitem_alloc(&jseg->js_list, D_JSEG, mp);
LIST_INIT(&jseg->js_entries);
+ LIST_INIT(&jseg->js_indirs);
jseg->js_state = ATTACHED;
jseg->js_jblocks = jblocks;
bp = geteblk(fs->fs_bsize, 0);
@@ -2794,7 +2837,8 @@ softdep_process_journal(mp, flags)
* the caller will loop if the entry it cares about is
* not written.
*/
- if (ump->softdep_on_journal == 0 || jblocks->jb_free == 0) {
+ cnt = ump->softdep_on_journal;
+ if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
bp->b_flags |= B_INVAL | B_NOCACHE;
WORKITEM_FREE(jseg, D_JSEG);
FREE_LOCK(&lk);
@@ -2806,8 +2850,9 @@ softdep_process_journal(mp, flags)
* Calculate the disk block size required for the available
* records rounded to the min size.
*/
- cnt = ump->softdep_on_journal;
- if (cnt < jrecmax)
+ if (cnt == 0)
+ size = devbsize;
+ else if (cnt < jrecmax)
size = howmany(cnt, jrecmin) * devbsize;
else
size = fs->fs_bsize;
@@ -2827,15 +2872,15 @@ softdep_process_journal(mp, flags)
* Initialize our jseg with cnt records. Assign the next
* sequence number to it and link it in-order.
*/
- cnt = MIN(ump->softdep_on_journal,
- (size / devbsize) * jrecmin);
+ cnt = MIN(cnt, (size / devbsize) * jrecmin);
jseg->js_buf = bp;
jseg->js_cnt = cnt;
jseg->js_refs = cnt + 1; /* Self ref. */
jseg->js_size = size;
jseg->js_seq = jblocks->jb_nextseq++;
- if (TAILQ_EMPTY(&jblocks->jb_segs))
- jblocks->jb_oldestseq = jseg->js_seq;
+ if (jblocks->jb_oldestseg == NULL)
+ jblocks->jb_oldestseg = jseg;
+ jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
if (jblocks->jb_writeseg == NULL)
jblocks->jb_writeseg = jseg;
@@ -2846,12 +2891,16 @@ softdep_process_journal(mp, flags)
off = 0;
while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
!= NULL) {
+ if (cnt == 0)
+ break;
/* Place a segment header on every device block. */
if ((off % devbsize) == 0) {
- jseg_write(ump, jblocks, jseg, data);
+ jseg_write(ump, jseg, data);
off += JREC_SIZE;
data = bp->b_data + off;
}
+ if (wk == needwk)
+ needwk = NULL;
remove_from_journal(wk);
wk->wk_state |= IOSTARTED;
WORKLIST_INSERT(&jseg->js_entries, wk);
@@ -2882,23 +2931,28 @@ softdep_process_journal(mp, flags)
TYPENAME(wk->wk_type));
/* NOTREACHED */
}
- if (--cnt == 0)
- break;
off += JREC_SIZE;
data = bp->b_data + off;
+ cnt--;
}
/*
* Write this one buffer and continue.
*/
+ segwritten = 1;
+ jblocks->jb_needseg = 0;
WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
FREE_LOCK(&lk);
BO_LOCK(bp->b_bufobj);
bgetvp(ump->um_devvp, bp);
BO_UNLOCK(bp->b_bufobj);
- if (flags == MNT_NOWAIT)
- bawrite(bp);
- else
+ /*
+ * We only do the blocking wait once we find the journal
+ * entry we're looking for.
+ */
+ if (needwk == NULL && flags & MNT_WAIT)
bwrite(bp);
+ else
+ bawrite(bp);
ACQUIRE_LOCK(&lk);
}
/*
@@ -2949,7 +3003,7 @@ complete_jseg(jseg)
break;
case D_JMVREF:
/* No jsegdep here. */
- free_jseg(jseg);
+ rele_jseg(jseg);
jmvref = WK_JMVREF(wk);
LIST_REMOVE(jmvref, jm_deps);
free_pagedep(jmvref->jm_pagedep);
@@ -2977,7 +3031,7 @@ complete_jseg(jseg)
wakeup(wk);
}
/* Release the self reference so the structure may be freed. */
- free_jseg(jseg);
+ rele_jseg(jseg);
}
/*
@@ -3009,11 +3063,16 @@ handle_written_jseg(jseg, bp)
return;
/* Iterate through available jsegs processing their entries. */
do {
+ jblocks->jb_oldestwrseq = jseg->js_oldseq;
jsegn = TAILQ_NEXT(jseg, js_next);
complete_jseg(jseg);
jseg = jsegn;
} while (jseg && jseg->js_state & DEPCOMPLETE);
jblocks->jb_writeseg = jseg;
+ /*
+ * Attempt to free jsegs now that oldestwrseq may have advanced.
+ */
+ free_jsegs(jblocks);
}
static inline struct jsegdep *
@@ -3682,6 +3741,8 @@ cancel_jnewblk(jnewblk, wkhd)
struct jsegdep *jsegdep;
jsegdep = jnewblk->jn_jsegdep;
+ if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
+ panic("cancel_jnewblk: Invalid state");
jnewblk->jn_jsegdep = NULL;
jnewblk->jn_dep = NULL;
jnewblk->jn_state |= GOINGAWAY;
@@ -3709,34 +3770,97 @@ free_jfreeblk(jfreeblk)
}
/*
- * Release one reference to a jseg and free it if the count reaches 0. This
- * should eventually reclaim journal space as well.
+ * Free a single jseg once it is no longer referenced in memory or on
+ * disk. Reclaim journal blocks and dependencies waiting for the segment
+ * to disappear.
*/
static void
-free_jseg(jseg)
+free_jseg(jseg, jblocks)
struct jseg *jseg;
+ struct jblocks *jblocks;
{
+ struct freework *freework;
+
+ /*
+ * Free freework structures that were lingering to indicate freed
+ * indirect blocks that forced journal write ordering on reallocate.
+ */
+ while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL) {
+ LIST_REMOVE(freework, fw_next);
+ WORKLIST_REMOVE(&freework->fw_list);
+ WORKITEM_FREE(freework, D_FREEWORK);
+ }
+ if (jblocks->jb_oldestseg == jseg)
+ jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
+ TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
+ jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
+ KASSERT(LIST_EMPTY(&jseg->js_entries),
+ ("free_jseg: Freed jseg has valid entries."));
+ WORKITEM_FREE(jseg, D_JSEG);
+}
+
+/*
+ * Free all jsegs that meet the criteria for being reclaimed and update
+ * oldestseg.
+ */
+static void
+free_jsegs(jblocks)
struct jblocks *jblocks;
+{
+ struct jseg *jseg;
- KASSERT(jseg->js_refs > 0,
- ("free_jseg: Invalid refcnt %d", jseg->js_refs));
- if (--jseg->js_refs != 0)
- return;
/*
* Free only those jsegs which have none allocated before them to
* preserve the journal space ordering.
*/
- jblocks = jseg->js_jblocks;
while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
- jblocks->jb_oldestseq = jseg->js_seq;
- if (jseg->js_refs != 0)
+ /*
+ * Only reclaim space when nothing depends on this journal
+ * set and another set has written that it is no longer
+ * valid.
+ */
+ if (jseg->js_refs != 0) {
+ jblocks->jb_oldestseg = jseg;
+ return;
+ }
+ if (!LIST_EMPTY(&jseg->js_indirs) &&
+ jseg->js_seq >= jblocks->jb_oldestwrseq)
break;
- TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
- jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
- KASSERT(LIST_EMPTY(&jseg->js_entries),
- ("free_jseg: Freed jseg has valid entries."));
- WORKITEM_FREE(jseg, D_JSEG);
+ free_jseg(jseg, jblocks);
}
+ /*
+ * If we exited the loop above we still must discover the
+ * oldest valid segment.
+ */
+ if (jseg)
+ for (jseg = jblocks->jb_oldestseg; jseg != NULL;
+ jseg = TAILQ_NEXT(jseg, js_next))
+ if (jseg->js_refs != 0)
+ break;
+ jblocks->jb_oldestseg = jseg;
+ /*
+ * The journal has no valid records but some jsegs may still be
+ * waiting on oldestwrseq to advance. We force a small record
+ * out to permit these lingering records to be reclaimed.
+ */
+ if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
+ jblocks->jb_needseg = 1;
+}
+
+/*
+ * Release one reference to a jseg and free it if the count reaches 0. This
+ * should eventually reclaim journal space as well.
+ */
+static void
+rele_jseg(jseg)
+ struct jseg *jseg;
+{
+
+ KASSERT(jseg->js_refs > 0,
+ ("free_jseg: Invalid refcnt %d", jseg->js_refs));
+ if (--jseg->js_refs != 0)
+ return;
+ free_jsegs(jseg->js_jblocks);
}
/*
@@ -3748,7 +3872,7 @@ free_jsegdep(jsegdep)
{
if (jsegdep->jd_seg)
- free_jseg(jsegdep->jd_seg);
+ rele_jseg(jsegdep->jd_seg);
WORKITEM_FREE(jsegdep, D_JSEGDEP);
}
@@ -3769,7 +3893,7 @@ jwait(wk)
* this point. The caller may call back in and re-issue the request.
*/
if ((wk->wk_state & IOSTARTED) == 0) {
- softdep_process_journal(wk->wk_mp, MNT_WAIT);
+ softdep_process_journal(wk->wk_mp, wk, MNT_WAIT);
return;
}
wk->wk_state |= IOWAITING;
@@ -6004,7 +6128,9 @@ freework_freeblock(freework)
LIST_SWAP(&wkhd, &freework->fw_jwork, worklist, wk_list);
jnewblk = freework->fw_jnewblk;
if (jnewblk != NULL) {
- cancel_jnewblk(jnewblk, &wkhd);
+ /* Could've already been canceled in indir_trunc(). */
+ if ((jnewblk->jn_state & GOINGAWAY) == 0)
+ cancel_jnewblk(jnewblk, &wkhd);
needj = 0;
} else if (needj)
WORKLIST_INSERT(&wkhd, &freework->fw_list);
@@ -6068,16 +6194,40 @@ handle_written_freework(freework)
{
struct freeblks *freeblks;
struct freework *parent;
+ struct jsegdep *jsegdep;
+ struct worklist *wk;
+ int needj;
+ needj = 0;
freeblks = freework->fw_freeblks;
parent = freework->fw_parent;
+ /*
+ * SUJ needs to wait for the segment referencing freed indirect
+ * blocks to expire so that we know the checker will not confuse
+ * a re-allocated indirect block with its old contents.
+ */
+ if (freework->fw_lbn <= -NDADDR &&
+ freework->fw_list.wk_mp->mnt_kern_flag & MNTK_SUJ) {
+ LIST_FOREACH(wk, &freeblks->fb_jwork, wk_list)
+ if (wk->wk_type == D_JSEGDEP)
+ break;
+ if (wk) {
+ jsegdep = WK_JSEGDEP(wk);
+ LIST_INSERT_HEAD(&jsegdep->jd_seg->js_indirs,
+ freework, fw_next);
+ WORKLIST_INSERT(INDIR_HASH(freework->fw_list.wk_mp,
+ freework->fw_blkno), &freework->fw_list);
+ needj = 1;
+ }
+ }
if (parent) {
if (--parent->fw_ref != 0)
parent = NULL;
freeblks = NULL;
} else if (--freeblks->fb_ref != 0)
freeblks = NULL;
- WORKITEM_FREE(freework, D_FREEWORK);
+ if (needj == 0)
+ WORKITEM_FREE(freework, D_FREEWORK);
/*
* Don't delay these block frees or it takes an intolerable amount
* of time to process truncates and free their journal entries.
@@ -6251,6 +6401,10 @@ indir_trunc(freework, dbn, lbn)
LIST_SWAP(&wkhd, &indirdep->ir_jwork, worklist, wk_list);
LIST_FOREACH_SAFE(jnewblk, &indirdep->ir_jnewblkhd,
jn_indirdeps, jnewblkn) {
+ /*
+ * XXX This cancel may cause some lengthy delay
+ * before the record is reclaimed below.
+ */
LIST_REMOVE(jnewblk, jn_indirdeps);
cancel_jnewblk(jnewblk, &wkhd);
}
@@ -8165,13 +8319,15 @@ softdep_disk_io_initiation(bp)
case D_ALLOCINDIR:
/*
* We have to wait for the jnewblk to be journaled
- * before we can write to a block otherwise the
- * contents may be confused with an earlier file
+ * before we can write to a block if the contents
+ * may be confused with an earlier file's indirect block
* at recovery time. Handle the marker as described
* above.
*/
newblk = WK_NEWBLK(wk);
- if (newblk->nb_jnewblk != NULL) {
+ if (newblk->nb_jnewblk != NULL &&
+ indirblk_inseg(newblk->nb_list.wk_mp,
+ newblk->nb_newblkno)) {
LIST_REMOVE(&marker, wk_list);
LIST_INSERT_BEFORE(wk, &marker, wk_list);
stat_jwait_newblk++;
diff --git a/sys/ufs/ffs/softdep.h b/sys/ufs/ffs/softdep.h
index 0f977a8..d864a4a 100644
--- a/sys/ufs/ffs/softdep.h
+++ b/sys/ufs/ffs/softdep.h
@@ -538,7 +538,7 @@ struct freeblks {
struct freework {
struct worklist fw_list; /* Delayed worklist. */
# define fw_state fw_list.wk_state
- LIST_ENTRY(freework) fw_next; /* Queue for freeblk list. */
+ LIST_ENTRY(freework) fw_next; /* For seg journal list. */
struct jnewblk *fw_jnewblk; /* Journal entry to cancel. */
struct freeblks *fw_freeblks; /* Root of operation. */
struct freework *fw_parent; /* Parent indirect. */
@@ -888,10 +888,12 @@ struct jseg {
struct worklist js_list; /* b_deps link for journal */
# define js_state js_list.wk_state
struct workhead js_entries; /* Entries awaiting write */
+ LIST_HEAD(, freework) js_indirs;/* List of indirects in this seg. */
TAILQ_ENTRY(jseg) js_next; /* List of all unfinished segments. */
struct jblocks *js_jblocks; /* Back pointer to block/seg list */
struct buf *js_buf; /* Buffer while unwritten */
uint64_t js_seq; /* Journal record sequence number. */
+ uint64_t js_oldseq; /* Oldest valid sequence number. */
int js_size; /* Size of journal record in bytes. */
int js_cnt; /* Total items allocated. */
int js_refs; /* Count of js_entries items. */
OpenPOWER on IntegriCloud