From 0ae45f63d4ef8d8eeec49c7d8b44a1775fff13e8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 2 Feb 2015 00:37:00 -0500 Subject: vfs: add support for a lazytime mount option Add a new mount option which enables a new "lazytime" mode. This mode causes atime, mtime, and ctime updates to only be made to the in-memory version of the inode. The on-disk times will only get updated when (a) if the inode needs to be updated for some non-time related change, (b) if userspace calls fsync(), syncfs() or sync(), or (c) just before an undeleted inode is evicted from memory. This is OK according to POSIX because there are no guarantees after a crash unless userspace explicitly requests via a fsync(2) call. For workloads which feature a large number of random write to a preallocated file, the lazytime mount option significantly reduces writes to the inode table. The repeated 4k writes to a single block will result in undesirable stress on flash devices and SMR disk drives. Even on conventional HDD's, the repeated writes to the inode table block will trigger Adjacent Track Interference (ATI) remediation latencies, which very negatively impact long tail latencies --- which is a very big deal for web serving tiers (for example). Google-Bug-Id: 18297052 Signed-off-by: Theodore Ts'o Signed-off-by: Al Viro --- include/linux/backing-dev.h | 1 + include/linux/fs.h | 5 ++++ include/trace/events/writeback.h | 60 +++++++++++++++++++++++++++++++++++++++- include/uapi/linux/fs.h | 4 ++- 4 files changed, 68 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5da6012..4cdf733 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -55,6 +55,7 @@ struct bdi_writeback { struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ + struct list_head b_dirty_time; /* time stamps are dirty */ spinlock_t list_lock; /* protects the b_* lists */ }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 42efe13..cd027ce 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1746,8 +1746,12 @@ struct super_operations { #define __I_DIO_WAKEUP 9 #define I_DIO_WAKEUP (1 << I_DIO_WAKEUP) #define I_LINKABLE (1 << 10) +#define I_DIRTY_TIME (1 << 11) +#define __I_DIRTY_TIME_EXPIRED 12 +#define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED) #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) +#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) extern void __mark_inode_dirty(struct inode *, int); static inline void mark_inode_dirty(struct inode *inode) @@ -1910,6 +1914,7 @@ extern int current_umask(void); extern void ihold(struct inode * inode); extern void iput(struct inode *); +extern int generic_update_time(struct inode *, struct timespec *, int); static inline struct inode *file_inode(const struct file *f) { diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index cee02d6..5ecb4c2 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -18,6 +18,8 @@ {I_FREEING, "I_FREEING"}, \ {I_CLEAR, "I_CLEAR"}, \ {I_SYNC, "I_SYNC"}, \ + {I_DIRTY_TIME, "I_DIRTY_TIME"}, \ + {I_DIRTY_TIME_EXPIRED, "I_DIRTY_TIME_EXPIRED"}, \ {I_REFERENCED, "I_REFERENCED"} \ ) @@ -68,6 +70,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_STRUCT__entry ( __array(char, name, 32) __field(unsigned long, ino) + __field(unsigned long, state) __field(unsigned long, flags) ), @@ -78,16 +81,25 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, strncpy(__entry->name, bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); __entry->ino = inode->i_ino; + __entry->state = inode->i_state; __entry->flags = flags; ), - TP_printk("bdi %s: ino=%lu flags=%s", + TP_printk("bdi %s: ino=%lu state=%s flags=%s", __entry->name, __entry->ino, + show_inode_state(__entry->state), show_inode_state(__entry->flags) ) ); +DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty, + + TP_PROTO(struct inode *inode, int flags), + + TP_ARGS(inode, flags) +); + DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start, TP_PROTO(struct inode *inode, int flags), @@ -598,6 +610,52 @@ DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, TP_ARGS(inode, wbc, nr_to_write) ); +DECLARE_EVENT_CLASS(writeback_lazytime_template, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field(unsigned long, ino ) + __field(unsigned long, state ) + __field( __u16, mode ) + __field(unsigned long, dirtied_when ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->state = inode->i_state; + __entry->mode = inode->i_mode; + __entry->dirtied_when = inode->dirtied_when; + ), + + TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, __entry->dirtied_when, + show_inode_state(__entry->state), __entry->mode) +); + +DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime_iput, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +DEFINE_EVENT(writeback_lazytime_template, writeback_dirty_inode_enqueue, + + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 3735fa0..9b964a5 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -90,6 +90,7 @@ struct inodes_stat_t { #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ #define MS_I_VERSION (1<<23) /* Update inode I_version field */ #define MS_STRICTATIME (1<<24) /* Always perform atime updates */ +#define MS_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ #define MS_NOSEC (1<<28) @@ -100,7 +101,8 @@ struct inodes_stat_t { /* * Superblock flags that can be altered by MS_REMOUNT */ -#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION) +#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION|\ + MS_LAZYTIME) /* * Old magic mount flag and mask -- cgit v1.1 From fe032c422c5ba562ba9c2d316f55e258e03259c6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 2 Feb 2015 00:37:01 -0500 Subject: vfs: add find_inode_nowait() function Add a new function find_inode_nowait() which is an even more general version of ilookup5_nowait(). It is designed for callers which need very fine grained control over when the function is allowed to block or increment the inode's reference count. Signed-off-by: Theodore Ts'o Signed-off-by: Al Viro --- include/linux/fs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index cd027ce..5ea8b6e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2441,6 +2441,11 @@ extern struct inode *ilookup(struct super_block *sb, unsigned long ino); extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); extern struct inode * iget_locked(struct super_block *, unsigned long); +extern struct inode *find_inode_nowait(struct super_block *, + unsigned long, + int (*match)(struct inode *, + unsigned long, void *), + void *data); extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); extern int insert_inode_locked(struct inode *); #ifdef CONFIG_DEBUG_LOCK_ALLOC -- cgit v1.1 From a26f49926da938f47561f386be56a83dd37a496d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 2 Feb 2015 00:37:02 -0500 Subject: ext4: add optimization for the lazytime mount option Add an optimization for the MS_LAZYTIME mount option so that we will opportunistically write out any inodes with the I_DIRTY_TIME flag set in a particular inode table block when we need to update some inode in that inode table block anyway. Also add some temporary code so that we can set the lazytime mount option without needing a modified /sbin/mount program which can set MS_LAZYTIME. We can eventually make this go away once util-linux has added support. Google-Bug-Id: 18297052 Signed-off-by: Theodore Ts'o Signed-off-by: Al Viro --- include/trace/events/ext4.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 6cfb841..6e5abd6 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -73,6 +73,36 @@ struct extent_status; { FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"}) +TRACE_EVENT(ext4_other_inode_update_time, + TP_PROTO(struct inode *inode, ino_t orig_ino), + + TP_ARGS(inode, orig_ino), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ino_t, orig_ino ) + __field( uid_t, uid ) + __field( gid_t, gid ) + __field( __u16, mode ) + ), + + TP_fast_assign( + __entry->orig_ino = orig_ino; + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->uid = i_uid_read(inode); + __entry->gid = i_gid_read(inode); + __entry->mode = inode->i_mode; + ), + + TP_printk("dev %d,%d orig_ino %lu ino %lu mode 0%o uid %u gid %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->orig_ino, + (unsigned long) __entry->ino, __entry->mode, + __entry->uid, __entry->gid) +); + TRACE_EVENT(ext4_free_inode, TP_PROTO(struct inode *inode), -- cgit v1.1