summaryrefslogtreecommitdiffstats
path: root/fs/ext4/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--fs/ext4/inode.c505
1 files changed, 333 insertions, 172 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 03ba20b..a2e7952 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -47,8 +47,10 @@
static inline int ext4_begin_ordered_truncate(struct inode *inode,
loff_t new_size)
{
- return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
- new_size);
+ return jbd2_journal_begin_ordered_truncate(
+ EXT4_SB(inode->i_sb)->s_journal,
+ &EXT4_I(inode)->jinode,
+ new_size);
}
static void ext4_invalidatepage(struct page *page, unsigned long offset);
@@ -369,6 +371,34 @@ static int ext4_block_to_path(struct inode *inode,
return n;
}
+static int __ext4_check_blockref(const char *function, struct inode *inode,
+ unsigned int *p, unsigned int max) {
+
+ unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
+ unsigned int *bref = p;
+ while (bref < p+max) {
+ if (unlikely(*bref >= maxblocks)) {
+ ext4_error(inode->i_sb, function,
+ "block reference %u >= max (%u) "
+ "in inode #%lu, offset=%d",
+ *bref, maxblocks,
+ inode->i_ino, (int)(bref-p));
+ return -EIO;
+ }
+ bref++;
+ }
+ return 0;
+}
+
+
+#define ext4_check_indirect_blockref(inode, bh) \
+ __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \
+ EXT4_ADDR_PER_BLOCK((inode)->i_sb))
+
+#define ext4_check_inode_blockref(inode) \
+ __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \
+ EXT4_NDIR_BLOCKS)
+
/**
* ext4_get_branch - read the chain of indirect blocks leading to data
* @inode: inode in question
@@ -413,9 +443,22 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
if (!p->key)
goto no_block;
while (--depth) {
- bh = sb_bread(sb, le32_to_cpu(p->key));
- if (!bh)
+ bh = sb_getblk(sb, le32_to_cpu(p->key));
+ if (unlikely(!bh))
goto failure;
+
+ if (!bh_uptodate_or_lock(bh)) {
+ if (bh_submit_read(bh) < 0) {
+ put_bh(bh);
+ goto failure;
+ }
+ /* validate block references */
+ if (ext4_check_indirect_blockref(inode, bh)) {
+ put_bh(bh);
+ goto failure;
+ }
+ }
+
add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
/* Reader: end */
if (!p->key)
@@ -457,6 +500,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
ext4_fsblk_t bg_start;
ext4_fsblk_t last_block;
ext4_grpblk_t colour;
+ ext4_group_t block_group;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
/* Try to find previous block */
for (p = ind->p - 1; p >= start; p--) {
@@ -472,9 +517,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
* It is going to be referred to from the inode itself? OK, just put it
* into the same cylinder group then.
*/
- bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
+ block_group = ei->i_block_group;
+ if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+ block_group &= ~(flex_size-1);
+ if (S_ISREG(inode->i_mode))
+ block_group++;
+ }
+ bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+ /*
+ * If we are doing delayed allocation, we don't need take
+ * colour into account.
+ */
+ if (test_opt(inode->i_sb, DELALLOC))
+ return bg_start;
+
if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
colour = (current->pid % 16) *
(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -973,6 +1031,17 @@ out:
return err;
}
+qsize_t ext4_get_reserved_space(struct inode *inode)
+{
+ unsigned long long total;
+
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ total = EXT4_I(inode)->i_reserved_data_blocks +
+ EXT4_I(inode)->i_reserved_meta_blocks;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+ return total;
+}
/*
* Calculate the number of metadata blocks need to reserve
* to allocate @blocks for non extent file based file
@@ -1034,8 +1103,21 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
/* update per-inode reservations */
BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
EXT4_I(inode)->i_reserved_data_blocks -= used;
-
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+ /*
+ * free those over-booking quota for metadata blocks
+ */
+ if (mdb_free)
+ vfs_dq_release_reservation_block(inode, mdb_free);
+
+ /*
+ * If we have done all the pending block allocations and if
+ * there aren't any writers on the inode, we can discard the
+ * inode's preallocations.
+ */
+ if (!total && (atomic_read(&inode->i_writecount) == 0))
+ ext4_discard_preallocations(inode);
}
/*
@@ -1366,6 +1448,10 @@ retry:
goto out;
}
+ /* We cannot recurse into the filesystem as the transaction is already
+ * started */
+ flags |= AOP_FLAG_NOFS;
+
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
ext4_journal_stop(handle);
@@ -1375,7 +1461,7 @@ retry:
*pagep = page;
ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
- ext4_get_block);
+ ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) {
ret = walk_page_buffers(handle, page_buffers(page),
@@ -1547,8 +1633,8 @@ static int ext4_journalled_write_end(struct file *file,
static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
{
int retries = 0;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- unsigned long md_needed, mdblocks, total = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned long md_needed, mdblocks, total = 0;
/*
* recalculate the amount of metadata blocks to reserve
@@ -1564,12 +1650,23 @@ repeat:
md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
total = md_needed + nrblocks;
+ /*
+ * Make quota reservation here to prevent quota overflow
+ * later. Real quota accounting is done at pages writeout
+ * time.
+ */
+ if (vfs_dq_reserve_block(inode, total)) {
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ return -EDQUOT;
+ }
+
if (ext4_claim_free_blocks(sbi, total)) {
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
yield();
goto repeat;
}
+ vfs_dq_release_reservation_block(inode, total);
return -ENOSPC;
}
EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
@@ -1623,6 +1720,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
EXT4_I(inode)->i_reserved_meta_blocks = mdb;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+ vfs_dq_release_reservation_block(inode, release);
}
static void ext4_da_page_release_reservation(struct page *page,
@@ -1652,9 +1751,10 @@ static void ext4_da_page_release_reservation(struct page *page,
struct mpage_da_data {
struct inode *inode;
- struct buffer_head lbh; /* extent of blocks */
+ sector_t b_blocknr; /* start block number of extent */
+ size_t b_size; /* size of extent */
+ unsigned long b_state; /* state of the extent */
unsigned long first_page, next_page; /* extent of pages */
- get_block_t *get_block;
struct writeback_control *wbc;
int io_done;
int pages_written;
@@ -1668,7 +1768,6 @@ struct mpage_da_data {
* @mpd->inode: inode
* @mpd->first_page: first page of the extent
* @mpd->next_page: page after the last page of the extent
- * @mpd->get_block: the filesystem's block mapper function
*
* By the time mpage_da_submit_io() is called we expect all blocks
* to be allocated. this may be wrong if allocation failed.
@@ -1688,7 +1787,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
/*
* We need to start from the first_page to the next_page - 1
* to make sure we also write the mapped dirty buffer_heads.
- * If we look at mpd->lbh.b_blocknr we would only be looking
+ * If we look at mpd->b_blocknr we would only be looking
* at the currently mapped buffer_heads.
*/
index = mpd->first_page;
@@ -1878,68 +1977,111 @@ static void ext4_print_free_blocks(struct inode *inode)
return;
}
+#define EXT4_DELALLOC_RSVED 1
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ loff_t disksize = EXT4_I(inode)->i_disksize;
+ handle_t *handle = NULL;
+
+ handle = ext4_journal_current_handle();
+ BUG_ON(!handle);
+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+ bh_result, create, 0, EXT4_DELALLOC_RSVED);
+ if (ret <= 0)
+ return ret;
+
+ bh_result->b_size = (ret << inode->i_blkbits);
+
+ if (ext4_should_order_data(inode)) {
+ int retval;
+ retval = ext4_jbd2_file_inode(handle, inode);
+ if (retval)
+ /*
+ * Failed to add inode for ordered mode. Don't
+ * update file size
+ */
+ return retval;
+ }
+
+ /*
+ * Update on-disk size along with block allocation we don't
+ * use 'extend_disksize' as size may change within already
+ * allocated block -bzzz
+ */
+ disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > EXT4_I(inode)->i_disksize) {
+ ext4_update_i_disksize(inode, disksize);
+ ret = ext4_mark_inode_dirty(handle, inode);
+ return ret;
+ }
+ return 0;
+}
+
/*
* mpage_da_map_blocks - go through given space
*
- * @mpd->lbh - bh describing space
- * @mpd->get_block - the filesystem's block mapper function
+ * @mpd - bh describing space
*
* The function skips space we know is already mapped to disk blocks.
*
*/
-static int mpage_da_map_blocks(struct mpage_da_data *mpd)
+static int mpage_da_map_blocks(struct mpage_da_data *mpd)
{
int err = 0;
struct buffer_head new;
- struct buffer_head *lbh = &mpd->lbh;
sector_t next;
/*
* We consider only non-mapped and non-allocated blocks
*/
- if (buffer_mapped(lbh) && !buffer_delay(lbh))
+ if ((mpd->b_state & (1 << BH_Mapped)) &&
+ !(mpd->b_state & (1 << BH_Delay)))
return 0;
- new.b_state = lbh->b_state;
+ new.b_state = mpd->b_state;
new.b_blocknr = 0;
- new.b_size = lbh->b_size;
- next = lbh->b_blocknr;
+ new.b_size = mpd->b_size;
+ next = mpd->b_blocknr;
/*
* If we didn't accumulate anything
* to write simply return
*/
if (!new.b_size)
return 0;
- err = mpd->get_block(mpd->inode, next, &new, 1);
- if (err) {
- /* If get block returns with error
- * we simply return. Later writepage
- * will redirty the page and writepages
- * will find the dirty page again
+ err = ext4_da_get_block_write(mpd->inode, next, &new, 1);
+ if (err) {
+ /*
+ * If get block returns with error we simply
+ * return. Later writepage will redirty the page and
+ * writepages will find the dirty page again
*/
if (err == -EAGAIN)
return 0;
if (err == -ENOSPC &&
- ext4_count_free_blocks(mpd->inode->i_sb)) {
+ ext4_count_free_blocks(mpd->inode->i_sb)) {
mpd->retval = err;
return 0;
}
/*
- * get block failure will cause us
- * to loop in writepages. Because
- * a_ops->writepage won't be able to
- * make progress. The page will be redirtied
- * by writepage and writepages will again
- * try to write the same.
+ * get block failure will cause us to loop in
+ * writepages, because a_ops->writepage won't be able
+ * to make progress. The page will be redirtied by
+ * writepage and writepages will again try to write
+ * the same.
*/
printk(KERN_EMERG "%s block allocation failed for inode %lu "
"at logical offset %llu with max blocks "
"%zd with error %d\n",
__func__, mpd->inode->i_ino,
(unsigned long long)next,
- lbh->b_size >> mpd->inode->i_blkbits, err);
+ mpd->b_size >> mpd->inode->i_blkbits, err);
printk(KERN_EMERG "This should not happen.!! "
"Data will be lost\n");
if (err == -ENOSPC) {
@@ -1947,7 +2089,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
}
/* invlaidate all the pages */
ext4_da_block_invalidatepages(mpd, next,
- lbh->b_size >> mpd->inode->i_blkbits);
+ mpd->b_size >> mpd->inode->i_blkbits);
return err;
}
BUG_ON(new.b_size == 0);
@@ -1959,7 +2101,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
* If blocks are delayed marked, we need to
* put actual blocknr and drop delayed bit
*/
- if (buffer_delay(lbh) || buffer_unwritten(lbh))
+ if ((mpd->b_state & (1 << BH_Delay)) ||
+ (mpd->b_state & (1 << BH_Unwritten)))
mpage_put_bnr_to_bhs(mpd, next, &new);
return 0;
@@ -1978,12 +2121,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
* the function is used to collect contig. blocks in same state
*/
static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
- sector_t logical, struct buffer_head *bh)
+ sector_t logical, size_t b_size,
+ unsigned long b_state)
{
sector_t next;
- size_t b_size = bh->b_size;
- struct buffer_head *lbh = &mpd->lbh;
- int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
+ int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
/* check if thereserved journal credits might overflow */
if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
@@ -2010,19 +2152,19 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
/*
* First block in the extent
*/
- if (lbh->b_size == 0) {
- lbh->b_blocknr = logical;
- lbh->b_size = b_size;
- lbh->b_state = bh->b_state & BH_FLAGS;
+ if (mpd->b_size == 0) {
+ mpd->b_blocknr = logical;
+ mpd->b_size = b_size;
+ mpd->b_state = b_state & BH_FLAGS;
return;
}
- next = lbh->b_blocknr + nrblocks;
+ next = mpd->b_blocknr + nrblocks;
/*
* Can we merge the block to our big extent?
*/
- if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
- lbh->b_size += b_size;
+ if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
+ mpd->b_size += b_size;
return;
}
@@ -2051,7 +2193,7 @@ static int __mpage_da_writepage(struct page *page,
{
struct mpage_da_data *mpd = data;
struct inode *inode = mpd->inode;
- struct buffer_head *bh, *head, fake;
+ struct buffer_head *bh, *head;
sector_t logical;
if (mpd->io_done) {
@@ -2093,9 +2235,9 @@ static int __mpage_da_writepage(struct page *page,
/*
* ... and blocks
*/
- mpd->lbh.b_size = 0;
- mpd->lbh.b_state = 0;
- mpd->lbh.b_blocknr = 0;
+ mpd->b_size = 0;
+ mpd->b_state = 0;
+ mpd->b_blocknr = 0;
}
mpd->next_page = page->index + 1;
@@ -2103,16 +2245,8 @@ static int __mpage_da_writepage(struct page *page,
(PAGE_CACHE_SHIFT - inode->i_blkbits);
if (!page_has_buffers(page)) {
- /*
- * There is no attached buffer heads yet (mmap?)
- * we treat the page asfull of dirty blocks
- */
- bh = &fake;
- bh->b_size = PAGE_CACHE_SIZE;
- bh->b_state = 0;
- set_buffer_dirty(bh);
- set_buffer_uptodate(bh);
- mpage_add_bh_to_extent(mpd, logical, bh);
+ mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
+ (1 << BH_Dirty) | (1 << BH_Uptodate));
if (mpd->io_done)
return MPAGE_DA_EXTENT_TAIL;
} else {
@@ -2130,8 +2264,10 @@ static int __mpage_da_writepage(struct page *page,
* with the page in ext4_da_writepage
*/
if (buffer_dirty(bh) &&
- (!buffer_mapped(bh) || buffer_delay(bh))) {
- mpage_add_bh_to_extent(mpd, logical, bh);
+ (!buffer_mapped(bh) || buffer_delay(bh))) {
+ mpage_add_bh_to_extent(mpd, logical,
+ bh->b_size,
+ bh->b_state);
if (mpd->io_done)
return MPAGE_DA_EXTENT_TAIL;
} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
@@ -2143,9 +2279,8 @@ static int __mpage_da_writepage(struct page *page,
* unmapped buffer_head later we need to
* use the b_state flag of that buffer_head.
*/
- if (mpd->lbh.b_size == 0)
- mpd->lbh.b_state =
- bh->b_state & BH_FLAGS;
+ if (mpd->b_size == 0)
+ mpd->b_state = bh->b_state & BH_FLAGS;
}
logical++;
} while ((bh = bh->b_this_page) != head);
@@ -2155,51 +2290,6 @@ static int __mpage_da_writepage(struct page *page,
}
/*
- * mpage_da_writepages - walk the list of dirty pages of the given
- * address space, allocates non-allocated blocks, maps newly-allocated
- * blocks to existing bhs and issue IO them
- *
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @get_block: the filesystem's block mapper function.
- *
- * This is a library function, which implements the writepages()
- * address_space_operation.
- */
-static int mpage_da_writepages(struct address_space *mapping,
- struct writeback_control *wbc,
- struct mpage_da_data *mpd)
-{
- int ret;
-
- if (!mpd->get_block)
- return generic_writepages(mapping, wbc);
-
- mpd->lbh.b_size = 0;
- mpd->lbh.b_state = 0;
- mpd->lbh.b_blocknr = 0;
- mpd->first_page = 0;
- mpd->next_page = 0;
- mpd->io_done = 0;
- mpd->pages_written = 0;
- mpd->retval = 0;
-
- ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
- /*
- * Handle last extent of pages
- */
- if (!mpd->io_done && mpd->next_page != mpd->first_page) {
- if (mpage_da_map_blocks(mpd) == 0)
- mpage_da_submit_io(mpd);
-
- mpd->io_done = 1;
- ret = MPAGE_DA_EXTENT_TAIL;
- }
- wbc->nr_to_write -= mpd->pages_written;
- return ret;
-}
-
-/*
* this is a special callback for ->write_begin() only
* it's intention is to return mapped block or reserve space
*/
@@ -2238,51 +2328,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
return ret;
}
-#define EXT4_DELALLOC_RSVED 1
-static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- int ret;
- unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
- loff_t disksize = EXT4_I(inode)->i_disksize;
- handle_t *handle = NULL;
-
- handle = ext4_journal_current_handle();
- BUG_ON(!handle);
- ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
- bh_result, create, 0, EXT4_DELALLOC_RSVED);
- if (ret > 0) {
-
- bh_result->b_size = (ret << inode->i_blkbits);
-
- if (ext4_should_order_data(inode)) {
- int retval;
- retval = ext4_jbd2_file_inode(handle, inode);
- if (retval)
- /*
- * Failed to add inode for ordered
- * mode. Don't update file size
- */
- return retval;
- }
-
- /*
- * Update on-disk size along with block allocation
- * we don't use 'extend_disksize' as size may change
- * within already allocated block -bzzz
- */
- disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
- if (disksize > i_size_read(inode))
- disksize = i_size_read(inode);
- if (disksize > EXT4_I(inode)->i_disksize) {
- ext4_update_i_disksize(inode, disksize);
- ret = ext4_mark_inode_dirty(handle, inode);
- return ret;
- }
- ret = 0;
- }
- return ret;
-}
static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
{
@@ -2437,6 +2482,7 @@ static int ext4_da_writepages(struct address_space *mapping,
int no_nrwrite_index_update;
int pages_written = 0;
long pages_skipped;
+ int range_cyclic, cycled = 1, io_done = 0;
int needed_blocks, ret = 0, nr_to_writebump = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
@@ -2488,9 +2534,15 @@ static int ext4_da_writepages(struct address_space *mapping,
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
- if (wbc->range_cyclic)
+ range_cyclic = wbc->range_cyclic;
+ if (wbc->range_cyclic) {
index = mapping->writeback_index;
- else
+ if (index)
+ cycled = 0;
+ wbc->range_start = index << PAGE_CACHE_SHIFT;
+ wbc->range_end = LLONG_MAX;
+ wbc->range_cyclic = 0;
+ } else
index = wbc->range_start >> PAGE_CACHE_SHIFT;
mpd.wbc = wbc;
@@ -2504,6 +2556,7 @@ static int ext4_da_writepages(struct address_space *mapping,
wbc->no_nrwrite_index_update = 1;
pages_skipped = wbc->pages_skipped;
+retry:
while (!ret && wbc->nr_to_write > 0) {
/*
@@ -2525,12 +2578,42 @@ static int ext4_da_writepages(struct address_space *mapping,
dump_stack();
goto out_writepages;
}
- mpd.get_block = ext4_da_get_block_write;
- ret = mpage_da_writepages(mapping, wbc, &mpd);
+
+ /*
+ * Now call __mpage_da_writepage to find the next
+ * contiguous region of logical blocks that need
+ * blocks to be allocated by ext4. We don't actually
+ * submit the blocks for I/O here, even though
+ * write_cache_pages thinks it will, and will set the
+ * pages as clean for write before calling
+ * __mpage_da_writepage().
+ */
+ mpd.b_size = 0;
+ mpd.b_state = 0;
+ mpd.b_blocknr = 0;
+ mpd.first_page = 0;
+ mpd.next_page = 0;
+ mpd.io_done = 0;
+ mpd.pages_written = 0;
+ mpd.retval = 0;
+ ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
+ &mpd);
+ /*
+ * If we have a contigous extent of pages and we
+ * haven't done the I/O yet, map the blocks and submit
+ * them for I/O.
+ */
+ if (!mpd.io_done && mpd.next_page != mpd.first_page) {
+ if (mpage_da_map_blocks(&mpd) == 0)
+ mpage_da_submit_io(&mpd);
+ mpd.io_done = 1;
+ ret = MPAGE_DA_EXTENT_TAIL;
+ }
+ wbc->nr_to_write -= mpd.pages_written;
ext4_journal_stop(handle);
- if (mpd.retval == -ENOSPC) {
+ if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
/* commit the transaction which would
* free blocks released in the transaction
* and try again
@@ -2546,6 +2629,7 @@ static int ext4_da_writepages(struct address_space *mapping,
pages_written += mpd.pages_written;
wbc->pages_skipped = pages_skipped;
ret = 0;
+ io_done = 1;
} else if (wbc->nr_to_write)
/*
* There is no more writeout needed
@@ -2554,6 +2638,13 @@ static int ext4_da_writepages(struct address_space *mapping,
*/
break;
}
+ if (!io_done && !cycled) {
+ cycled = 1;
+ index = 0;
+ wbc->range_start = index << PAGE_CACHE_SHIFT;
+ wbc->range_end = mapping->writeback_index - 1;
+ goto retry;
+ }
if (pages_skipped != wbc->pages_skipped)
printk(KERN_EMERG "This should not happen leaving %s "
"with nr_to_write = %ld ret = %d\n",
@@ -2561,6 +2652,7 @@ static int ext4_da_writepages(struct address_space *mapping,
/* Update index */
index += pages_written;
+ wbc->range_cyclic = range_cyclic;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
/*
* set the writeback_index so that range_cyclic
@@ -2648,6 +2740,9 @@ retry:
ret = PTR_ERR(handle);
goto out;
}
+ /* We cannot recurse into the filesystem as the transaction is already
+ * started */
+ flags |= AOP_FLAG_NOFS;
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
@@ -2790,6 +2885,48 @@ out:
return;
}
+/*
+ * Force all delayed allocation blocks to be allocated for a given inode.
+ */
+int ext4_alloc_da_blocks(struct inode *inode)
+{
+ if (!EXT4_I(inode)->i_reserved_data_blocks &&
+ !EXT4_I(inode)->i_reserved_meta_blocks)
+ return 0;
+
+ /*
+ * We do something simple for now. The filemap_flush() will
+ * also start triggering a write of the data blocks, which is
+ * not strictly speaking necessary (and for users of
+ * laptop_mode, not even desirable). However, to do otherwise
+ * would require replicating code paths in:
+ *
+ * ext4_da_writepages() ->
+ * write_cache_pages() ---> (via passed in callback function)
+ * __mpage_da_writepage() -->
+ * mpage_add_bh_to_extent()
+ * mpage_da_map_blocks()
+ *
+ * The problem is that write_cache_pages(), located in
+ * mm/page-writeback.c, marks pages clean in preparation for
+ * doing I/O, which is not desirable if we're not planning on
+ * doing I/O at all.
+ *
+ * We could call write_cache_pages(), and then redirty all of
+ * the pages by calling redirty_page_for_writeback() but that
+ * would be ugly in the extreme. So instead we would need to
+ * replicate parts of the code in the above functions,
+ * simplifying them becuase we wouldn't actually intend to
+ * write out the pages, but rather only collect contiguous
+ * logical block extents, call the multi-block allocator, and
+ * then update the buffer heads with the block allocations.
+ *
+ * For now, though, we'll cheat by calling filemap_flush(),
+ * which will map the blocks, and start the I/O, but not
+ * actually wait for the I/O to complete.
+ */
+ return filemap_flush(inode->i_mapping);
+}
/*
* bmap() is special. It gets used by applications such as lilo and by
@@ -3812,6 +3949,9 @@ void ext4_truncate(struct inode *inode)
if (!ext4_can_truncate(inode))
return;
+ if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+ ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
+
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
ext4_ext_truncate(inode);
return;
@@ -4054,12 +4194,7 @@ make_io:
unsigned num;
table = ext4_inode_table(sb, gdp);
- /* Make sure s_inode_readahead_blks is a power of 2 */
- while (EXT4_SB(sb)->s_inode_readahead_blks &
- (EXT4_SB(sb)->s_inode_readahead_blks-1))
- EXT4_SB(sb)->s_inode_readahead_blks =
- (EXT4_SB(sb)->s_inode_readahead_blks &
- (EXT4_SB(sb)->s_inode_readahead_blks-1));
+ /* s_inode_readahead_blks is always a power of 2 */
b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
if (table > b)
b = table;
@@ -4231,6 +4366,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_disksize = inode->i_size;
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
ei->i_block_group = iloc.block_group;
+ ei->i_last_alloc_group = ~0;
/*
* NOTE! The in-memory inode i_data array is in little-endian order
* even on big-endian machines: we do NOT byteswap the block numbers!
@@ -4273,6 +4409,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
}
+ if (ei->i_flags & EXT4_EXTENTS_FL) {
+ /* Validate extent which is part of inode */
+ ret = ext4_ext_check_inode(inode);
+ } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode))) {
+ /* Validate block references which are part of inode */
+ ret = ext4_check_inode_blockref(inode);
+ }
+ if (ret) {
+ brelse(bh);
+ goto bad_inode;
+ }
+
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
@@ -4289,7 +4439,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode);
}
- } else {
+ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
inode->i_op = &ext4_special_inode_operations;
if (raw_inode->i_block[0])
init_special_inode(inode, inode->i_mode,
@@ -4297,6 +4448,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
else
init_special_inode(inode, inode->i_mode,
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+ } else {
+ brelse(bh);
+ ret = -EIO;
+ ext4_error(inode->i_sb, __func__,
+ "bogus i_mode (%o) for inode=%lu",
+ inode->i_mode, inode->i_ino);
+ goto bad_inode;
}
brelse(iloc.bh);
ext4_set_inode_flags(inode);
@@ -4586,7 +4744,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
error = PTR_ERR(handle);
goto err_out;
}
- error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+ error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
if (error) {
ext4_journal_stop(handle);
return error;
@@ -4965,7 +5123,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
* i_size has been changed by generic_commit_write() and we thus need
* to include the updated inode in the current transaction.
*
- * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
+ * Also, vfs_dq_alloc_block() will always dirty the inode when blocks
* are allocated to the file.
*
* If the inode is marked synchronous, we don't honour that here - doing
@@ -5090,8 +5248,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
return !buffer_mapped(bh);
}
-int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
+ struct page *page = vmf->page;
loff_t size;
unsigned long len;
int ret = -EINVAL;
@@ -5143,6 +5302,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
goto out_unlock;
ret = 0;
out_unlock:
+ if (ret)
+ ret = VM_FAULT_SIGBUS;
up_read(&inode->i_alloc_sem);
return ret;
}
OpenPOWER on IntegriCloud