summaryrefslogtreecommitdiffstats
path: root/fs/fs-writeback.c
diff options
context:
space:
mode:
authorTheodore Ts'o <tytso@mit.edu>2015-02-02 00:37:00 -0500
committerAl Viro <viro@zeniv.linux.org.uk>2015-02-05 02:45:00 -0500
commit0ae45f63d4ef8d8eeec49c7d8b44a1775fff13e8 (patch)
tree660dbb014482092361eab263847fb906b5a9ec22 /fs/fs-writeback.c
parente36f014edff70fc02b3d3d79cead1d58f289332e (diff)
downloadop-kernel-dev-0ae45f63d4ef8d8eeec49c7d8b44a1775fff13e8.zip
op-kernel-dev-0ae45f63d4ef8d8eeec49c7d8b44a1775fff13e8.tar.gz
vfs: add support for a lazytime mount option
Add a new mount option which enables a new "lazytime" mode. This mode causes atime, mtime, and ctime updates to only be made to the in-memory version of the inode. The on-disk times will only get updated when (a) if the inode needs to be updated for some non-time related change, (b) if userspace calls fsync(), syncfs() or sync(), or (c) just before an undeleted inode is evicted from memory. This is OK according to POSIX because there are no guarantees after a crash unless userspace explicitly requests via a fsync(2) call. For workloads which feature a large number of random write to a preallocated file, the lazytime mount option significantly reduces writes to the inode table. The repeated 4k writes to a single block will result in undesirable stress on flash devices and SMR disk drives. Even on conventional HDD's, the repeated writes to the inode table block will trigger Adjacent Track Interference (ATI) remediation latencies, which very negatively impact long tail latencies --- which is a very big deal for web serving tiers (for example). Google-Bug-Id: 18297052 Signed-off-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r--fs/fs-writeback.c62
1 files changed, 51 insertions, 11 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2d609a5..0046861 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -247,14 +247,19 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
return ret;
}
+#define EXPIRE_DIRTY_ATIME 0x0001
+
/*
* Move expired (dirtied before work->older_than_this) dirty inodes from
* @delaying_queue to @dispatch_queue.
*/
static int move_expired_inodes(struct list_head *delaying_queue,
struct list_head *dispatch_queue,
+ int flags,
struct wb_writeback_work *work)
{
+ unsigned long *older_than_this = NULL;
+ unsigned long expire_time;
LIST_HEAD(tmp);
struct list_head *pos, *node;
struct super_block *sb = NULL;
@@ -262,13 +267,21 @@ static int move_expired_inodes(struct list_head *delaying_queue,
int do_sb_sort = 0;
int moved = 0;
+ if ((flags & EXPIRE_DIRTY_ATIME) == 0)
+ older_than_this = work->older_than_this;
+ else if ((work->reason == WB_REASON_SYNC) == 0) {
+ expire_time = jiffies - (HZ * 86400);
+ older_than_this = &expire_time;
+ }
while (!list_empty(delaying_queue)) {
inode = wb_inode(delaying_queue->prev);
- if (work->older_than_this &&
- inode_dirtied_after(inode, *work->older_than_this))
+ if (older_than_this &&
+ inode_dirtied_after(inode, *older_than_this))
break;
list_move(&inode->i_wb_list, &tmp);
moved++;
+ if (flags & EXPIRE_DIRTY_ATIME)
+ set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
if (sb_is_blkdev_sb(inode->i_sb))
continue;
if (sb && sb != inode->i_sb)
@@ -309,9 +322,12 @@ out:
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
{
int moved;
+
assert_spin_locked(&wb->list_lock);
list_splice_init(&wb->b_more_io, &wb->b_io);
- moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
+ moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
+ moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
+ EXPIRE_DIRTY_ATIME, work);
trace_writeback_queue_io(wb, work, moved);
}
@@ -435,6 +451,8 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* updates after data IO completion.
*/
redirty_tail(inode, wb);
+ } else if (inode->i_state & I_DIRTY_TIME) {
+ list_move(&inode->i_wb_list, &wb->b_dirty_time);
} else {
/* The inode is clean. Remove from writeback lists. */
list_del_init(&inode->i_wb_list);
@@ -481,7 +499,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
spin_lock(&inode->i_lock);
dirty = inode->i_state & I_DIRTY;
- inode->i_state &= ~I_DIRTY;
+ if (((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) &&
+ (inode->i_state & I_DIRTY_TIME)) ||
+ (inode->i_state & I_DIRTY_TIME_EXPIRED)) {
+ dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
+ trace_writeback_lazytime(inode);
+ }
+ inode->i_state &= ~dirty;
/*
* Paired with smp_mb() in __mark_inode_dirty(). This allows
@@ -501,8 +525,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
spin_unlock(&inode->i_lock);
+ if (dirty & I_DIRTY_TIME)
+ mark_inode_dirty_sync(inode);
/* Don't write the inode if only I_DIRTY_PAGES was set */
- if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+ if (dirty & ~I_DIRTY_PAGES) {
int err = write_inode(inode, wbc);
if (ret == 0)
ret = err;
@@ -550,7 +576,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
* make sure inode is on some writeback list and leave it there unless
* we have completely cleaned the inode.
*/
- if (!(inode->i_state & I_DIRTY) &&
+ if (!(inode->i_state & I_DIRTY_ALL) &&
(wbc->sync_mode != WB_SYNC_ALL ||
!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
goto out;
@@ -565,7 +591,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
* If inode is clean, remove it from writeback lists. Otherwise don't
* touch it. See comment above for explanation.
*/
- if (!(inode->i_state & I_DIRTY))
+ if (!(inode->i_state & I_DIRTY_ALL))
list_del_init(&inode->i_wb_list);
spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
@@ -707,7 +733,7 @@ static long writeback_sb_inodes(struct super_block *sb,
wrote += write_chunk - wbc.nr_to_write;
spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
- if (!(inode->i_state & I_DIRTY))
+ if (!(inode->i_state & I_DIRTY_ALL))
wrote++;
requeue_inode(inode, wb, &wbc);
inode_sync_complete(inode);
@@ -1145,16 +1171,20 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
* page->mapping->host, so the page-dirtying time is recorded in the internal
* blockdev inode.
*/
+#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
void __mark_inode_dirty(struct inode *inode, int flags)
{
struct super_block *sb = inode->i_sb;
struct backing_dev_info *bdi = NULL;
+ int dirtytime;
+
+ trace_writeback_mark_inode_dirty(inode, flags);
/*
* Don't do this for I_DIRTY_PAGES - that doesn't actually
* dirty the inode itself
*/
- if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+ if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
trace_writeback_dirty_inode_start(inode, flags);
if (sb->s_op->dirty_inode)
@@ -1162,6 +1192,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
trace_writeback_dirty_inode(inode, flags);
}
+ if (flags & I_DIRTY_INODE)
+ flags &= ~I_DIRTY_TIME;
+ dirtytime = flags & I_DIRTY_TIME;
/*
* Paired with smp_mb() in __writeback_single_inode() for the
@@ -1169,16 +1202,21 @@ void __mark_inode_dirty(struct inode *inode, int flags)
*/
smp_mb();
- if ((inode->i_state & flags) == flags)
+ if (((inode->i_state & flags) == flags) ||
+ (dirtytime && (inode->i_state & I_DIRTY_INODE)))
return;
if (unlikely(block_dump))
block_dump___mark_inode_dirty(inode);
spin_lock(&inode->i_lock);
+ if (dirtytime && (inode->i_state & I_DIRTY_INODE))
+ goto out_unlock_inode;
if ((inode->i_state & flags) != flags) {
const int was_dirty = inode->i_state & I_DIRTY;
+ if (flags & I_DIRTY_INODE)
+ inode->i_state &= ~I_DIRTY_TIME;
inode->i_state |= flags;
/*
@@ -1225,8 +1263,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
}
inode->dirtied_when = jiffies;
- list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+ list_move(&inode->i_wb_list, dirtytime ?
+ &bdi->wb.b_dirty_time : &bdi->wb.b_dirty);
spin_unlock(&bdi->wb.list_lock);
+ trace_writeback_dirty_inode_enqueue(inode);
if (wakeup_bdi)
bdi_wakeup_thread_delayed(bdi);
OpenPOWER on IntegriCloud