From 2db938bee32e7469ca8ed9bfb3a05535f28c680d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 21 Feb 2011 17:25:37 +0100 Subject: jbd: Refine commit writeout logic Currently we write out all journal buffers in WRITE_SYNC mode. This improves performance for fsync heavy workloads but hinders performance when writes are mostly asynchronous, most noticably it slows down readers and users complain about slow desktop response etc. So submit writes as asynchronous in the normal case and only submit writes as WRITE_SYNC if we detect someone is waiting for current transaction commit. I've gathered some numbers to back this change. The first is the read latency test. It measures time to read 1 MB after several seconds of sleeping in presence of streaming writes. Top 10 times (out of 90) in us: Before After 2131586 697473 1709932 557487 1564598 535642 1480462 347573 1478579 323153 1408496 222181 1388960 181273 1329565 181070 1252486 172832 1223265 172278 Average: 619377 82180 So the improvement in both maximum and average latency is massive. I've measured fsync throughput by: fs_mark -n 100 -t 1 -s 16384 -d /mnt/fsync/ -S 1 -L 4 in presence of streaming reader. The numbers (fsyncs/s) are: Before After 9.9 6.3 6.8 6.0 6.3 6.2 5.8 6.1 So fsync performance seems unharmed by this change. Signed-off-by: Jan Kara --- fs/jbd/journal.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/jbd/journal.c') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 0971e92..2047fd7 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -563,6 +563,8 @@ int log_wait_commit(journal_t *journal, tid_t tid) spin_unlock(&journal->j_state_lock); #endif spin_lock(&journal->j_state_lock); + if (!tid_geq(journal->j_commit_waited, tid)) + journal->j_commit_waited = tid; while (tid_gt(tid, journal->j_commit_sequence)) { jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); -- cgit v1.1 From 9754e39c7bc51328f145e933bfb0df47cd67b6e9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 7 Apr 2012 12:33:03 +0200 Subject: jbd: Split updating of journal superblock and marking journal empty There are three case of updating journal superblock. In the first case, we want to mark journal as empty (setting s_sequence to 0), in the second case we want to update log tail, in the third case we want to update s_errno. Split these cases into separate functions. It makes the code slightly more straightforward and later patches will make the distinction even more important. Signed-off-by: Jan Kara --- fs/jbd/journal.c | 164 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 97 insertions(+), 67 deletions(-) (limited to 'fs/jbd/journal.c') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 2047fd7..44c104a 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -923,8 +923,22 @@ static int journal_reset(journal_t *journal) journal->j_max_transaction_buffers = journal->j_maxlen / 4; - /* Add the dynamic fields and write it to disk. */ - journal_update_superblock(journal, 1); + /* + * As a special case, if the on-disk copy is already marked as needing + * no recovery (s_start == 0), then we can safely defer the superblock + * update until the next commit by setting JFS_FLUSHED. This avoids + * attempting a write to a potential-readonly device. + */ + if (sb->s_start == 0) { + jbd_debug(1,"JBD: Skipping superblock update on recovered sb " + "(start %u, seq %d, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, + journal->j_errno); + journal->j_flags |= JFS_FLUSHED; + } else { + /* Add the dynamic fields and write it to disk. */ + journal_update_sb_log_tail(journal); + } return journal_start_thread(journal); } @@ -1001,35 +1015,11 @@ int journal_create(journal_t *journal) return journal_reset(journal); } -/** - * void journal_update_superblock() - Update journal sb on disk. - * @journal: The journal to update. - * @wait: Set to '0' if you don't want to wait for IO completion. - * - * Update a journal's dynamic superblock fields and write it to disk, - * optionally waiting for the IO to complete. - */ -void journal_update_superblock(journal_t *journal, int wait) +static void journal_write_superblock(journal_t *journal) { - journal_superblock_t *sb = journal->j_superblock; struct buffer_head *bh = journal->j_sb_buffer; - /* - * As a special case, if the on-disk copy is already marked as needing - * no recovery (s_start == 0) and there are no outstanding transactions - * in the filesystem, then we can safely defer the superblock update - * until the next commit by setting JFS_FLUSHED. This avoids - * attempting a write to a potential-readonly device. - */ - if (sb->s_start == 0 && journal->j_tail_sequence == - journal->j_transaction_sequence) { - jbd_debug(1,"JBD: Skipping superblock update on recovered sb " - "(start %u, seq %d, errno %d)\n", - journal->j_tail, journal->j_tail_sequence, - journal->j_errno); - goto out; - } - + trace_journal_write_superblock(journal); if (buffer_write_io_error(bh)) { char b[BDEVNAME_SIZE]; /* @@ -1047,44 +1037,94 @@ void journal_update_superblock(journal_t *journal, int wait) set_buffer_uptodate(bh); } + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + if (buffer_write_io_error(bh)) { + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "JBD: I/O error detected " + "when updating journal superblock for %s.\n", + journal_dev_name(journal, b)); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } +} + +/** + * journal_update_sb_log_tail() - Update log tail in journal sb on disk. + * @journal: The journal to update. + * + * Update a journal's superblock information about log tail and write it to + * disk, waiting for the IO to complete. + */ +void journal_update_sb_log_tail(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + spin_lock(&journal->j_state_lock); jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); sb->s_start = cpu_to_be32(journal->j_tail); - sb->s_errno = cpu_to_be32(journal->j_errno); spin_unlock(&journal->j_state_lock); - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - if (wait) { - sync_dirty_buffer(bh); - if (buffer_write_io_error(bh)) { - char b[BDEVNAME_SIZE]; - printk(KERN_ERR "JBD: I/O error detected " - "when updating journal superblock for %s.\n", - journal_dev_name(journal, b)); - clear_buffer_write_io_error(bh); - set_buffer_uptodate(bh); - } - } else - write_dirty_buffer(bh, WRITE); + journal_write_superblock(journal); - trace_jbd_update_superblock_end(journal, wait); -out: - /* If we have just flushed the log (by marking s_start==0), then - * any future commit will have to be careful to update the - * superblock again to re-record the true start of the log. */ + /* Log is no longer empty */ + spin_lock(&journal->j_state_lock); + WARN_ON(!sb->s_sequence); + journal->j_flags &= ~JFS_FLUSHED; + spin_unlock(&journal->j_state_lock); +} + +/** + * mark_journal_empty() - Mark on disk journal as empty. + * @journal: The journal to update. + * + * Update a journal's dynamic superblock fields to show that journal is empty. + * Write updated superblock to disk waiting for IO to complete. + */ +static void mark_journal_empty(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; spin_lock(&journal->j_state_lock); - if (sb->s_start) - journal->j_flags &= ~JFS_FLUSHED; - else - journal->j_flags |= JFS_FLUSHED; + jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n", + journal->j_tail_sequence); + + sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); + sb->s_start = cpu_to_be32(0); + spin_unlock(&journal->j_state_lock); + + journal_write_superblock(journal); + + spin_lock(&journal->j_state_lock); + /* Log is empty */ + journal->j_flags |= JFS_FLUSHED; spin_unlock(&journal->j_state_lock); } +/** + * journal_update_sb_errno() - Update error in the journal. + * @journal: The journal to update. + * + * Update a journal's errno. Write updated superblock to disk waiting for IO + * to complete. + */ +static void journal_update_sb_errno(journal_t *journal) +{ + journal_superblock_t *sb = journal->j_superblock; + + spin_lock(&journal->j_state_lock); + jbd_debug(1, "JBD: updating superblock error (errno %d)\n", + journal->j_errno); + sb->s_errno = cpu_to_be32(journal->j_errno); + spin_unlock(&journal->j_state_lock); + + journal_write_superblock(journal); +} + /* * Read the superblock for a given journal, performing initial * validation of the format. @@ -1268,14 +1308,11 @@ int journal_destroy(journal_t *journal) if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { - /* We can now mark the journal as empty. */ - journal->j_tail = 0; journal->j_tail_sequence = ++journal->j_transaction_sequence; - journal_update_superblock(journal, 1); - } else { + mark_journal_empty(journal); + } else err = -EIO; - } brelse(journal->j_sb_buffer); } @@ -1457,7 +1494,6 @@ int journal_flush(journal_t *journal) { int err = 0; transaction_t *transaction = NULL; - unsigned int old_tail; spin_lock(&journal->j_state_lock); @@ -1499,14 +1535,8 @@ int journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ + mark_journal_empty(journal); spin_lock(&journal->j_state_lock); - old_tail = journal->j_tail; - journal->j_tail = 0; - spin_unlock(&journal->j_state_lock); - journal_update_superblock(journal, 1); - spin_lock(&journal->j_state_lock); - journal->j_tail = old_tail; - J_ASSERT(!journal->j_running_transaction); J_ASSERT(!journal->j_committing_transaction); J_ASSERT(!journal->j_checkpoint_transactions); @@ -1547,7 +1577,7 @@ int journal_wipe(journal_t *journal, int write) err = journal_skip_recovery(journal); if (write) - journal_update_superblock(journal, 1); + mark_journal_empty(journal); no_recovery: return err; @@ -1615,7 +1645,7 @@ static void __journal_abort_soft (journal_t *journal, int errno) __journal_abort_hard(journal); if (errno) - journal_update_superblock(journal, 1); + journal_update_sb_errno(journal); } /** -- cgit v1.1 From 1ce8486dcc00c1e095af8d155fa4451936b89013 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 7 Apr 2012 12:50:13 +0200 Subject: jbd: protect all log tail updates with j_checkpoint_mutex There are some log tail updates that are not protected by j_checkpoint_mutex. Some of these are harmless because they happen during startup or shutdown but updates in journal_commit_transaction() and journal_flush() can really race with other log tail updates (e.g. someone doing journal_flush() with someone running cleanup_journal_tail()). So protect all log tail updates with j_checkpoint_mutex. Signed-off-by: Jan Kara --- fs/jbd/journal.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'fs/jbd/journal.c') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 44c104a..b29c767 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -936,8 +936,11 @@ static int journal_reset(journal_t *journal) journal->j_errno); journal->j_flags |= JFS_FLUSHED; } else { + /* Lock here to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); /* Add the dynamic fields and write it to disk. */ journal_update_sb_log_tail(journal); + mutex_unlock(&journal->j_checkpoint_mutex); } return journal_start_thread(journal); } @@ -1061,6 +1064,7 @@ void journal_update_sb_log_tail(journal_t *journal) { journal_superblock_t *sb = journal->j_superblock; + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); spin_lock(&journal->j_state_lock); jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); @@ -1089,6 +1093,7 @@ static void mark_journal_empty(journal_t *journal) { journal_superblock_t *sb = journal->j_superblock; + BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); spin_lock(&journal->j_state_lock); jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n", journal->j_tail_sequence); @@ -1293,6 +1298,8 @@ int journal_destroy(journal_t *journal) /* Force any old transactions to disk */ + /* We cannot race with anybody but must keep assertions happy */ + mutex_lock(&journal->j_checkpoint_mutex); /* Totally anal locking here... */ spin_lock(&journal->j_list_lock); while (journal->j_checkpoint_transactions != NULL) { @@ -1315,6 +1322,7 @@ int journal_destroy(journal_t *journal) err = -EIO; brelse(journal->j_sb_buffer); } + mutex_unlock(&journal->j_checkpoint_mutex); if (journal->j_inode) iput(journal->j_inode); @@ -1528,6 +1536,7 @@ int journal_flush(journal_t *journal) if (is_journal_aborted(journal)) return -EIO; + mutex_lock(&journal->j_checkpoint_mutex); cleanup_journal_tail(journal); /* Finally, mark the journal as really needing no recovery. @@ -1536,6 +1545,7 @@ int journal_flush(journal_t *journal) * commits of data to the journal will restore the current * s_start value. */ mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); spin_lock(&journal->j_state_lock); J_ASSERT(!journal->j_running_transaction); J_ASSERT(!journal->j_committing_transaction); @@ -1576,8 +1586,12 @@ int journal_wipe(journal_t *journal, int write) write ? "Clearing" : "Ignoring"); err = journal_skip_recovery(journal); - if (write) + if (write) { + /* Lock to make assertions happy... */ + mutex_lock(&journal->j_checkpoint_mutex); mark_journal_empty(journal); + mutex_unlock(&journal->j_checkpoint_mutex); + } no_recovery: return err; -- cgit v1.1 From fd2cbd4dfa3db477dd6226d387d3f1911d36a6a9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 7 Apr 2012 11:05:19 +0200 Subject: jbd: Write journal superblock with WRITE_FUA after checkpointing If journal superblock is written only in disk's caches and other transaction starts reusing space of the transaction cleaned from the log, it can happen blocks of a new transaction reach the disk before journal superblock. When power failure happens in such case, subsequent journal replay would still try to replay the old transaction but some of it's blocks may be already overwritten by the new transaction. For this reason we must use WRITE_FUA when updating log tail and we must first write new log tail to disk and update in-memory information only after that. Signed-off-by: Jan Kara --- fs/jbd/journal.c | 60 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 21 deletions(-) (limited to 'fs/jbd/journal.c') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index b29c767..425c2f2 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -938,8 +938,16 @@ static int journal_reset(journal_t *journal) } else { /* Lock here to make assertions happy... */ mutex_lock(&journal->j_checkpoint_mutex); - /* Add the dynamic fields and write it to disk. */ - journal_update_sb_log_tail(journal); + /* + * Update log tail information. We use WRITE_FUA since new + * transaction will start reusing journal space and so we + * must make sure information about current log tail is on + * disk before that. + */ + journal_update_sb_log_tail(journal, + journal->j_tail_sequence, + journal->j_tail, + WRITE_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } return journal_start_thread(journal); @@ -1018,11 +1026,15 @@ int journal_create(journal_t *journal) return journal_reset(journal); } -static void journal_write_superblock(journal_t *journal) +static void journal_write_superblock(journal_t *journal, int write_op) { struct buffer_head *bh = journal->j_sb_buffer; + int ret; - trace_journal_write_superblock(journal); + trace_journal_write_superblock(journal, write_op); + if (!(journal->j_flags & JFS_BARRIER)) + write_op &= ~(REQ_FUA | REQ_FLUSH); + lock_buffer(bh); if (buffer_write_io_error(bh)) { char b[BDEVNAME_SIZE]; /* @@ -1040,40 +1052,46 @@ static void journal_write_superblock(journal_t *journal) set_buffer_uptodate(bh); } - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - sync_dirty_buffer(bh); + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; + ret = submit_bh(write_op, bh); + wait_on_buffer(bh); if (buffer_write_io_error(bh)) { - char b[BDEVNAME_SIZE]; - printk(KERN_ERR "JBD: I/O error detected " - "when updating journal superblock for %s.\n", - journal_dev_name(journal, b)); clear_buffer_write_io_error(bh); set_buffer_uptodate(bh); + ret = -EIO; + } + if (ret) { + char b[BDEVNAME_SIZE]; + printk(KERN_ERR "JBD: Error %d detected " + "when updating journal superblock for %s.\n", + ret, journal_dev_name(journal, b)); } } /** * journal_update_sb_log_tail() - Update log tail in journal sb on disk. * @journal: The journal to update. + * @tail_tid: TID of the new transaction at the tail of the log + * @tail_block: The first block of the transaction at the tail of the log + * @write_op: With which operation should we write the journal sb * * Update a journal's superblock information about log tail and write it to * disk, waiting for the IO to complete. */ -void journal_update_sb_log_tail(journal_t *journal) +void journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, + unsigned int tail_block, int write_op) { journal_superblock_t *sb = journal->j_superblock; BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); - spin_lock(&journal->j_state_lock); - jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n", - journal->j_tail, journal->j_tail_sequence, journal->j_errno); + jbd_debug(1,"JBD: updating superblock (start %u, seq %u)\n", + tail_block, tail_tid); - sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); - sb->s_start = cpu_to_be32(journal->j_tail); - spin_unlock(&journal->j_state_lock); + sb->s_sequence = cpu_to_be32(tail_tid); + sb->s_start = cpu_to_be32(tail_block); - journal_write_superblock(journal); + journal_write_superblock(journal, write_op); /* Log is no longer empty */ spin_lock(&journal->j_state_lock); @@ -1102,7 +1120,7 @@ static void mark_journal_empty(journal_t *journal) sb->s_start = cpu_to_be32(0); spin_unlock(&journal->j_state_lock); - journal_write_superblock(journal); + journal_write_superblock(journal, WRITE_FUA); spin_lock(&journal->j_state_lock); /* Log is empty */ @@ -1127,7 +1145,7 @@ static void journal_update_sb_errno(journal_t *journal) sb->s_errno = cpu_to_be32(journal->j_errno); spin_unlock(&journal->j_state_lock); - journal_write_superblock(journal); + journal_write_superblock(journal, WRITE_SYNC); } /* -- cgit v1.1