summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/ext4/extents.c93
-rw-r--r--fs/ext4/super.c1
-rw-r--r--include/trace/events/ext4.h82
3 files changed, 176 insertions, 0 deletions
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c2ac06c..8b6a17b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2919,12 +2919,23 @@ out:
* a> There is no split required: Entire extent should be initialized
* b> Splits in two extents: Write is happening at either end of the extent
* c> Splits in three extents: Somone is writing in middle of the extent
+ *
+ * Pre-conditions:
+ * - The extent pointed to by 'path' is uninitialized.
+ * - The extent pointed to by 'path' contains a superset
+ * of the logical span [map->m_lblk, map->m_lblk + map->m_len).
+ *
+ * Post-conditions on success:
+ * - the returned value is the number of blocks beyond map->l_lblk
+ * that are allocated and initialized.
+ * It is guaranteed to be >= map->m_len.
*/
static int ext4_ext_convert_to_initialized(handle_t *handle,
struct inode *inode,
struct ext4_map_blocks *map,
struct ext4_ext_path *path)
{
+ struct ext4_extent_header *eh;
struct ext4_map_blocks split_map;
struct ext4_extent zero_ex;
struct ext4_extent *ex;
@@ -2944,11 +2955,93 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
eof_block = map->m_lblk + map->m_len;
depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
allocated = ee_len - (map->m_lblk - ee_block);
+ trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
+
+ /* Pre-conditions */
+ BUG_ON(!ext4_ext_is_uninitialized(ex));
+ BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
+ BUG_ON(map->m_lblk + map->m_len > ee_block + ee_len);
+
+ /*
+ * Attempt to transfer newly initialized blocks from the currently
+ * uninitialized extent to its left neighbor. This is much cheaper
+ * than an insertion followed by a merge as those involve costly
+ * memmove() calls. This is the common case in steady state for
+ * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
+ * writes.
+ *
+ * Limitations of the current logic:
+ * - L1: we only deal with writes at the start of the extent.
+ * The approach could be extended to writes at the end
+ * of the extent but this scenario was deemed less common.
+ * - L2: we do not deal with writes covering the whole extent.
+ * This would require removing the extent if the transfer
+ * is possible.
+ * - L3: we only attempt to merge with an extent stored in the
+ * same extent tree node.
+ */
+ if ((map->m_lblk == ee_block) && /*L1*/
+ (map->m_len < ee_len) && /*L2*/
+ (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/
+ struct ext4_extent *prev_ex;
+ ext4_lblk_t prev_lblk;
+ ext4_fsblk_t prev_pblk, ee_pblk;
+ unsigned int prev_len, write_len;
+
+ prev_ex = ex - 1;
+ prev_lblk = le32_to_cpu(prev_ex->ee_block);
+ prev_len = ext4_ext_get_actual_len(prev_ex);
+ prev_pblk = ext4_ext_pblock(prev_ex);
+ ee_pblk = ext4_ext_pblock(ex);
+ write_len = map->m_len;
+
+ /*
+ * A transfer of blocks from 'ex' to 'prev_ex' is allowed
+ * upon those conditions:
+ * - C1: prev_ex is initialized,
+ * - C2: prev_ex is logically abutting ex,
+ * - C3: prev_ex is physically abutting ex,
+ * - C4: prev_ex can receive the additional blocks without
+ * overflowing the (initialized) length limit.
+ */
+ if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/
+ ((prev_lblk + prev_len) == ee_block) && /*C2*/
+ ((prev_pblk + prev_len) == ee_pblk) && /*C3*/
+ (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ trace_ext4_ext_convert_to_initialized_fastpath(inode,
+ map, ex, prev_ex);
+
+ /* Shift the start of ex by 'write_len' blocks */
+ ex->ee_block = cpu_to_le32(ee_block + write_len);
+ ext4_ext_store_pblock(ex, ee_pblk + write_len);
+ ex->ee_len = cpu_to_le16(ee_len - write_len);
+ ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+
+ /* Extend prev_ex by 'write_len' blocks */
+ prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
+
+ /* Mark the block containing both extents as dirty */
+ ext4_ext_dirty(handle, inode, path + depth);
+
+ /* Update path to point to the right extent */
+ path[depth].p_ext = prev_ex;
+
+ /* Result: number of initialized blocks past m_lblk */
+ allocated = write_len;
+ goto out;
+ }
+ }
+
WARN_ON(map->m_lblk < ee_block);
/*
* It is safe to convert extent to initialized via explicit
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dcc4605..9953d80 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,6 +45,7 @@
#include <linux/freezer.h>
#include "ext4.h"
+#include "ext4_extents.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index c9a341e..748ff7c 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -9,6 +9,7 @@
struct ext4_allocation_context;
struct ext4_allocation_request;
+struct ext4_extent;
struct ext4_prealloc_space;
struct ext4_inode_info;
struct mpage_da_data;
@@ -1394,6 +1395,87 @@ DEFINE_EVENT(ext4__truncate, ext4_truncate_exit,
TP_ARGS(inode)
);
+/* 'ux' is the uninitialized extent. */
+TRACE_EVENT(ext4_ext_convert_to_initialized_enter,
+ TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
+ struct ext4_extent *ux),
+
+ TP_ARGS(inode, map, ux),
+
+ TP_STRUCT__entry(
+ __field( ino_t, ino )
+ __field( dev_t, dev )
+ __field( ext4_lblk_t, m_lblk )
+ __field( unsigned, m_len )
+ __field( ext4_lblk_t, u_lblk )
+ __field( unsigned, u_len )
+ __field( ext4_fsblk_t, u_pblk )
+ ),
+
+ TP_fast_assign(
+ __entry->ino = inode->i_ino;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->m_lblk = map->m_lblk;
+ __entry->m_len = map->m_len;
+ __entry->u_lblk = le32_to_cpu(ux->ee_block);
+ __entry->u_len = ext4_ext_get_actual_len(ux);
+ __entry->u_pblk = ext4_ext_pblock(ux);
+ ),
+
+ TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u u_lblk %u u_len %u "
+ "u_pblk %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long) __entry->ino,
+ __entry->m_lblk, __entry->m_len,
+ __entry->u_lblk, __entry->u_len, __entry->u_pblk)
+);
+
+/*
+ * 'ux' is the uninitialized extent.
+ * 'ix' is the initialized extent to which blocks are transferred.
+ */
+TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath,
+ TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
+ struct ext4_extent *ux, struct ext4_extent *ix),
+
+ TP_ARGS(inode, map, ux, ix),
+
+ TP_STRUCT__entry(
+ __field( ino_t, ino )
+ __field( dev_t, dev )
+ __field( ext4_lblk_t, m_lblk )
+ __field( unsigned, m_len )
+ __field( ext4_lblk_t, u_lblk )
+ __field( unsigned, u_len )
+ __field( ext4_fsblk_t, u_pblk )
+ __field( ext4_lblk_t, i_lblk )
+ __field( unsigned, i_len )
+ __field( ext4_fsblk_t, i_pblk )
+ ),
+
+ TP_fast_assign(
+ __entry->ino = inode->i_ino;
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->m_lblk = map->m_lblk;
+ __entry->m_len = map->m_len;
+ __entry->u_lblk = le32_to_cpu(ux->ee_block);
+ __entry->u_len = ext4_ext_get_actual_len(ux);
+ __entry->u_pblk = ext4_ext_pblock(ux);
+ __entry->i_lblk = le32_to_cpu(ix->ee_block);
+ __entry->i_len = ext4_ext_get_actual_len(ix);
+ __entry->i_pblk = ext4_ext_pblock(ix);
+ ),
+
+ TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u "
+ "u_lblk %u u_len %u u_pblk %llu "
+ "i_lblk %u i_len %u i_pblk %llu ",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long) __entry->ino,
+ __entry->m_lblk, __entry->m_len,
+ __entry->u_lblk, __entry->u_len, __entry->u_pblk,
+ __entry->i_lblk, __entry->i_len, __entry->i_pblk)
+);
+
DECLARE_EVENT_CLASS(ext4__map_blocks_enter,
TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
unsigned int len, unsigned int flags),
OpenPOWER on IntegriCloud