diff options
Diffstat (limited to 'fs/ext4')
-rw-r--r-- | fs/ext4/acl.c | 188 | ||||
-rw-r--r-- | fs/ext4/balloc.c | 14 | ||||
-rw-r--r-- | fs/ext4/dir.c | 20 | ||||
-rw-r--r-- | fs/ext4/ext4.h | 5 | ||||
-rw-r--r-- | fs/ext4/ext4_extents.h | 4 | ||||
-rw-r--r-- | fs/ext4/ext4_jbd2.h | 8 | ||||
-rw-r--r-- | fs/ext4/extents.c | 168 | ||||
-rw-r--r-- | fs/ext4/ialloc.c | 60 | ||||
-rw-r--r-- | fs/ext4/inode.c | 634 | ||||
-rw-r--r-- | fs/ext4/mballoc.c | 293 | ||||
-rw-r--r-- | fs/ext4/mballoc.h | 10 | ||||
-rw-r--r-- | fs/ext4/migrate.c | 3 | ||||
-rw-r--r-- | fs/ext4/resize.c | 82 | ||||
-rw-r--r-- | fs/ext4/super.c | 317 | ||||
-rw-r--r-- | fs/ext4/xattr.c | 2 |
15 files changed, 1152 insertions, 656 deletions
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index c7d04e1..694ed6f 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -40,34 +40,35 @@ ext4_acl_from_disk(const void *value, size_t size) acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); - for (n=0; n < count; n++) { + for (n = 0; n < count; n++) { ext4_acl_entry *entry = (ext4_acl_entry *)value; if ((char *)value + sizeof(ext4_acl_entry_short) > end) goto fail; acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); - switch(acl->a_entries[n].e_tag) { - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - value = (char *)value + - sizeof(ext4_acl_entry_short); - acl->a_entries[n].e_id = ACL_UNDEFINED_ID; - break; - - case ACL_USER: - case ACL_GROUP: - value = (char *)value + sizeof(ext4_acl_entry); - if ((char *)value > end) - goto fail; - acl->a_entries[n].e_id = - le32_to_cpu(entry->e_id); - break; - - default: + + switch (acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(ext4_acl_entry_short); + acl->a_entries[n].e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + value = (char *)value + sizeof(ext4_acl_entry); + if ((char *)value > end) goto fail; + acl->a_entries[n].e_id = + le32_to_cpu(entry->e_id); + break; + + default: + goto fail; } } if (value != end) @@ -96,27 +97,26 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) return ERR_PTR(-ENOMEM); ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); e = (char *)ext_acl + sizeof(ext4_acl_header); - for (n=0; n < acl->a_count; n++) { + for (n = 0; n < acl->a_count; n++) { ext4_acl_entry *entry = (ext4_acl_entry *)e; entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); - switch(acl->a_entries[n].e_tag) { - case ACL_USER: - case ACL_GROUP: - entry->e_id = - cpu_to_le32(acl->a_entries[n].e_id); - e += sizeof(ext4_acl_entry); - break; - - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - e += sizeof(ext4_acl_entry_short); - break; - - default: - goto fail; + switch (acl->a_entries[n].e_tag) { + case ACL_USER: + case ACL_GROUP: + entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + e += sizeof(ext4_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(ext4_acl_entry_short); + break; + + default: + goto fail; } } return (char *)ext_acl; @@ -167,23 +167,23 @@ ext4_get_acl(struct inode *inode, int type) if (!test_opt(inode->i_sb, POSIX_ACL)) return NULL; - switch(type) { - case ACL_TYPE_ACCESS: - acl = ext4_iget_acl(inode, &ei->i_acl); - if (acl != EXT4_ACL_NOT_CACHED) - return acl; - name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; - break; - - case ACL_TYPE_DEFAULT: - acl = ext4_iget_acl(inode, &ei->i_default_acl); - if (acl != EXT4_ACL_NOT_CACHED) - return acl; - name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; - break; - - default: - return ERR_PTR(-EINVAL); + switch (type) { + case ACL_TYPE_ACCESS: + acl = ext4_iget_acl(inode, &ei->i_acl); + if (acl != EXT4_ACL_NOT_CACHED) + return acl; + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + + case ACL_TYPE_DEFAULT: + acl = ext4_iget_acl(inode, &ei->i_default_acl); + if (acl != EXT4_ACL_NOT_CACHED) + return acl; + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + + default: + return ERR_PTR(-EINVAL); } retval = ext4_xattr_get(inode, name_index, "", NULL, 0); if (retval > 0) { @@ -201,14 +201,14 @@ ext4_get_acl(struct inode *inode, int type) kfree(value); if (!IS_ERR(acl)) { - switch(type) { - case ACL_TYPE_ACCESS: - ext4_iset_acl(inode, &ei->i_acl, acl); - break; - - case ACL_TYPE_DEFAULT: - ext4_iset_acl(inode, &ei->i_default_acl, acl); - break; + switch (type) { + case ACL_TYPE_ACCESS: + ext4_iset_acl(inode, &ei->i_acl, acl); + break; + + case ACL_TYPE_DEFAULT: + ext4_iset_acl(inode, &ei->i_default_acl, acl); + break; } } return acl; @@ -232,31 +232,31 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - switch(type) { - case ACL_TYPE_ACCESS: - name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - mode_t mode = inode->i_mode; - error = posix_acl_equiv_mode(acl, &mode); - if (error < 0) - return error; - else { - inode->i_mode = mode; - ext4_mark_inode_dirty(handle, inode); - if (error == 0) - acl = NULL; - } + switch (type) { + case ACL_TYPE_ACCESS: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) + return error; + else { + inode->i_mode = mode; + ext4_mark_inode_dirty(handle, inode); + if (error == 0) + acl = NULL; } - break; + } + break; - case ACL_TYPE_DEFAULT: - name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; - if (!S_ISDIR(inode->i_mode)) - return acl ? -EACCES : 0; - break; + case ACL_TYPE_DEFAULT: + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; - default: - return -EINVAL; + default: + return -EINVAL; } if (acl) { value = ext4_acl_to_disk(acl, &size); @@ -269,14 +269,14 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, kfree(value); if (!error) { - switch(type) { - case ACL_TYPE_ACCESS: - ext4_iset_acl(inode, &ei->i_acl, acl); - break; - - case ACL_TYPE_DEFAULT: - ext4_iset_acl(inode, &ei->i_default_acl, acl); - break; + switch (type) { + case ACL_TYPE_ACCESS: + ext4_iset_acl(inode, &ei->i_acl, acl); + break; + + case ACL_TYPE_DEFAULT: + ext4_iset_acl(inode, &ei->i_default_acl, acl); + break; } } return error; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 495ab21..e9fa960 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -314,25 +314,28 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) if (unlikely(!bh)) { ext4_error(sb, __func__, "Cannot read block bitmap - " - "block_group = %d, block_bitmap = %llu", - (int)block_group, (unsigned long long)bitmap_blk); + "block_group = %lu, block_bitmap = %llu", + block_group, bitmap_blk); return NULL; } if (bh_uptodate_or_lock(bh)) return bh; + spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh, block_group, desc); set_buffer_uptodate(bh); unlock_buffer(bh); + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); return bh; } + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); if (bh_submit_read(bh) < 0) { put_bh(bh); ext4_error(sb, __func__, "Cannot read block bitmap - " - "block_group = %d, block_bitmap = %llu", - (int)block_group, (unsigned long long)bitmap_blk); + "block_group = %lu, block_bitmap = %llu", + block_group, bitmap_blk); return NULL; } ext4_valid_block_bitmap(sb, desc, block_group, bh); @@ -1623,6 +1626,9 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, free_blocks = percpu_counter_sum_and_set(&sbi->s_freeblocks_counter); #endif + if (free_blocks <= root_blocks) + /* we don't have free space */ + return 0; if (free_blocks - root_blocks < nblocks) return free_blocks - root_blocks; return nblocks; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index d3d23d7..ec8e33b 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -411,7 +411,7 @@ static int call_filldir(struct file * filp, void * dirent, get_dtype(sb, fname->file_type)); if (error) { filp->f_pos = curr_pos; - info->extra_fname = fname->next; + info->extra_fname = fname; return error; } fname = fname->next; @@ -450,11 +450,21 @@ static int ext4_dx_readdir(struct file * filp, * If there are any leftover names on the hash collision * chain, return them first. */ - if (info->extra_fname && - call_filldir(filp, dirent, filldir, info->extra_fname)) - goto finished; + if (info->extra_fname) { + if (call_filldir(filp, dirent, filldir, info->extra_fname)) + goto finished; - if (!info->curr_node) + info->extra_fname = NULL; + info->curr_node = rb_next(info->curr_node); + if (!info->curr_node) { + if (info->next_hash == ~0) { + filp->f_pos = EXT4_HTREE_EOF; + goto finished; + } + info->curr_hash = info->next_hash; + info->curr_minor_hash = 0; + } + } else if (!info->curr_node) info->curr_node = rb_first(&info->root); while (1) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 303e41c..2950032 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1044,7 +1044,6 @@ extern void ext4_mb_update_group_info(struct ext4_group_info *grp, /* inode.c */ -void ext4_da_release_space(struct inode *inode, int used, int to_free); int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); struct buffer_head *ext4_getblk(handle_t *, struct inode *, @@ -1073,6 +1072,8 @@ extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); @@ -1228,6 +1229,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations; /* extents.c */ extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, + int chunk); extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned long max_blocks, struct buffer_head *bh_result, diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 6c166c0..d33dc56 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -216,7 +216,9 @@ extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); extern int ext4_extent_tree_init(handle_t *, struct inode *); -extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); extern int ext4_ext_try_to_merge(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *); diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index eb8bc3a..b455c68 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -51,6 +51,14 @@ EXT4_XATTR_TRANS_BLOCKS - 2 + \ 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) +/* + * Define the number of metadata blocks we need to account to modify data. + * + * This include super block, inode block, quota blocks and xattr blocks + */ +#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ + 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + /* Delete operations potentially hit one directory's namespace plus an * entire inode, plus arbitrary amounts of bitmap/indirection data. Be * generous. We can grow the delete transaction later if necessary. */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 42c4c0c..b24d3c5 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -99,7 +99,7 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed) if (handle->h_buffer_credits > needed) return 0; err = ext4_journal_extend(handle, needed); - if (err) + if (err <= 0) return err; return ext4_journal_restart(handle, needed); } @@ -1441,7 +1441,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode, /* * get the next allocated block if the extent in the path - * is before the requested block(s) + * is before the requested block(s) */ if (b2 < b1) { b2 = ext4_ext_next_allocated_block(path); @@ -1747,54 +1747,61 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, } /* - * ext4_ext_calc_credits_for_insert: - * This routine returns max. credits that the extent tree can consume. - * It should be OK for low-performance paths like ->writepage() - * To allow many writing processes to fit into a single transaction, - * the caller should calculate credits under i_data_sem and - * pass the actual path. + * ext4_ext_calc_credits_for_single_extent: + * This routine returns max. credits that needed to insert an extent + * to the extent tree. + * When pass the actual path, the caller should calculate credits + * under i_data_sem. */ -int ext4_ext_calc_credits_for_insert(struct inode *inode, +int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, struct ext4_ext_path *path) { - int depth, needed; - if (path) { + int depth = ext_depth(inode); + int ret = 0; + /* probably there is space in leaf? */ - depth = ext_depth(inode); if (le16_to_cpu(path[depth].p_hdr->eh_entries) - < le16_to_cpu(path[depth].p_hdr->eh_max)) - return 1; - } - - /* - * given 32-bit logical block (4294967296 blocks), max. tree - * can be 4 levels in depth -- 4 * 340^4 == 53453440000. - * Let's also add one more level for imbalance. - */ - depth = 5; + < le16_to_cpu(path[depth].p_hdr->eh_max)) { - /* allocation of new data block(s) */ - needed = 2; + /* + * There are some space in the leaf tree, no + * need to account for leaf block credit + * + * bitmaps and block group descriptor blocks + * and other metadat blocks still need to be + * accounted. + */ + /* 1 bitmap, 1 block group descriptor */ + ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); + } + } - /* - * tree can be full, so it would need to grow in depth: - * we need one credit to modify old root, credits for - * new root will be added in split accounting - */ - needed += 1; + return ext4_chunk_trans_blocks(inode, nrblocks); +} - /* - * Index split can happen, we would need: - * allocate intermediate indexes (bitmap + group) - * + change two blocks at each level, but root (already included) - */ - needed += (depth * 2) + (depth * 2); +/* + * How many index/leaf blocks need to change/allocate to modify nrblocks? + * + * if nrblocks are fit in a single extent (chunk flag is 1), then + * in the worse case, each tree level index/leaf need to be changed + * if the tree split due to insert a new extent, then the old tree + * index/leaf need to be updated too + * + * If the nrblocks are discontiguous, they could cause + * the whole tree split more than once, but this is really rare. + */ +int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int index; + int depth = ext_depth(inode); - /* any allocation modifies superblock */ - needed += 1; + if (chunk) + index = depth * 2; + else + index = depth * 3; - return needed; + return index; } static int ext4_remove_blocks(handle_t *handle, struct inode *inode, @@ -1910,16 +1917,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, BUG_ON(b != ex_ee_block + ex_ee_len - 1); } - /* at present, extent can't cross block group: */ - /* leaf + bitmap + group desc + sb + inode */ - credits = 5; + /* + * 3 for leaf, sb, and inode plus 2 (bmap and group + * descriptor) for each block group; assume two block + * groups plus ex_ee_len/blocks_per_block_group for + * the worst case + */ + credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); if (ex == EXT_FIRST_EXTENT(eh)) { correct_index = 1; credits += (ext_depth(inode)) + 1; } -#ifdef CONFIG_QUOTA credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif err = ext4_ext_journal_restart(handle, credits); if (err) @@ -2323,7 +2332,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, unsigned int newdepth; /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ if (allocated <= EXT4_EXT_ZERO_LEN) { - /* Mark first half uninitialized. + /* + * iblock == ee_block is handled by the zerouout + * at the beginning. + * Mark first half uninitialized. * Mark second half initialized and zero out the * initialized extent */ @@ -2346,7 +2358,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ex->ee_len = orig_ex.ee_len; ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); - /* zeroed the full extent */ + /* blocks available from iblock */ return allocated; } else if (err) @@ -2374,6 +2386,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, err = PTR_ERR(path); return err; } + /* get the second half extent details */ ex = path[depth].p_ext; err = ext4_ext_get_access(handle, inode, path + depth); @@ -2403,6 +2416,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zeroed the full extent */ + /* blocks available from iblock */ return allocated; } else if (err) @@ -2418,23 +2432,22 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, */ orig_ex.ee_len = cpu_to_le16(ee_len - ext4_ext_get_actual_len(ex3)); - if (newdepth != depth) { - depth = newdepth; - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, iblock, path); - if (IS_ERR(path)) { - err = PTR_ERR(path); - goto out; - } - eh = path[depth].p_hdr; - ex = path[depth].p_ext; - if (ex2 != &newex) - ex2 = ex; - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; + depth = newdepth; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, iblock, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out; } + eh = path[depth].p_hdr; + ex = path[depth].p_ext; + if (ex2 != &newex) + ex2 = ex; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + allocated = max_blocks; /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying @@ -2452,6 +2465,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ext4_ext_dirty(handle, inode, path + depth); /* zero out the first half */ + /* blocks available from iblock */ return allocated; } } @@ -2796,7 +2810,7 @@ void ext4_ext_truncate(struct inode *inode) /* * probably first extent we're gonna free will be last in block */ - err = ext4_writepage_trans_blocks(inode) + 3; + err = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, err); if (IS_ERR(handle)) return; @@ -2810,7 +2824,7 @@ void ext4_ext_truncate(struct inode *inode) down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_invalidate_cache(inode); - ext4_mb_discard_inode_preallocations(inode); + ext4_discard_reservation(inode); /* * TODO: optimization is possible here. @@ -2849,27 +2863,6 @@ out_stop: ext4_journal_stop(handle); } -/* - * ext4_ext_writepage_trans_blocks: - * calculate max number of blocks we could modify - * in order to allocate new block for an inode - */ -int ext4_ext_writepage_trans_blocks(struct inode *inode, int num) -{ - int needed; - - needed = ext4_ext_calc_credits_for_insert(inode, NULL); - - /* caller wants to allocate num blocks, but note it includes sb */ - needed = needed * num - (num - 1); - -#ifdef CONFIG_QUOTA - needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif - - return needed; -} - static void ext4_falloc_update_inode(struct inode *inode, int mode, loff_t new_size, int update_ctime) { @@ -2930,10 +2923,9 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - block; /* - * credits to insert 1 extent into extent tree + buffers to be able to - * modify 1 super block, 1 block bitmap and 1 group descriptor. + * credits to insert 1 extent into extent tree */ - credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; + credits = ext4_chunk_trans_blocks(inode, max_blocks); mutex_lock(&inode->i_mutex); retry: while (ret >= 0 && ret < max_blocks) { diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index a92eb30..f344834 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -97,34 +97,44 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, * Return buffer_head of bitmap on success or NULL. */ static struct buffer_head * -read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) +ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) { struct ext4_group_desc *desc; struct buffer_head *bh = NULL; + ext4_fsblk_t bitmap_blk; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) - goto error_out; - if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { - bh = sb_getblk(sb, ext4_inode_bitmap(sb, desc)); - if (!buffer_uptodate(bh)) { - lock_buffer(bh); - if (!buffer_uptodate(bh)) { - ext4_init_inode_bitmap(sb, bh, block_group, - desc); - set_buffer_uptodate(bh); - } - unlock_buffer(bh); - } - } else { - bh = sb_bread(sb, ext4_inode_bitmap(sb, desc)); + return NULL; + bitmap_blk = ext4_inode_bitmap(sb, desc); + bh = sb_getblk(sb, bitmap_blk); + if (unlikely(!bh)) { + ext4_error(sb, __func__, + "Cannot read inode bitmap - " + "block_group = %lu, inode_bitmap = %llu", + block_group, bitmap_blk); + return NULL; } - if (!bh) - ext4_error(sb, "read_inode_bitmap", + if (bh_uptodate_or_lock(bh)) + return bh; + + spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); + if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + ext4_init_inode_bitmap(sb, bh, block_group, desc); + set_buffer_uptodate(bh); + unlock_buffer(bh); + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + return bh; + } + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + if (bh_submit_read(bh) < 0) { + put_bh(bh); + ext4_error(sb, __func__, "Cannot read inode bitmap - " "block_group = %lu, inode_bitmap = %llu", - block_group, ext4_inode_bitmap(sb, desc)); -error_out: + block_group, bitmap_blk); + return NULL; + } return bh; } @@ -200,7 +210,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) } block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); - bitmap_bh = read_inode_bitmap(sb, block_group); + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); if (!bitmap_bh) goto error_return; @@ -341,7 +351,7 @@ find_close_to_parent: goto found_flexbg; } - if (best_flex < 0 || + if (flex_group[best_flex].free_inodes == 0 || (flex_group[i].free_blocks > flex_group[best_flex].free_blocks && flex_group[i].free_inodes)) @@ -623,7 +633,7 @@ got_group: goto fail; brelse(bitmap_bh); - bitmap_bh = read_inode_bitmap(sb, group); + bitmap_bh = ext4_read_inode_bitmap(sb, group); if (!bitmap_bh) goto fail; @@ -728,7 +738,7 @@ got: /* When marking the block group with * ~EXT4_BG_INODE_UNINIT we don't want to depend - * on the value of bg_itable_unsed even though + * on the value of bg_itable_unused even though * mke2fs could have initialized the same for us. * Instead we calculated the value below */ @@ -891,7 +901,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); - bitmap_bh = read_inode_bitmap(sb, block_group); + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); if (!bitmap_bh) { ext4_warning(sb, __func__, "inode bitmap error for orphan %lu", ino); @@ -969,7 +979,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb) continue; desc_count += le16_to_cpu(gdp->bg_free_inodes_count); brelse(bitmap_bh); - bitmap_bh = read_inode_bitmap(sb, i); + bitmap_bh = ext4_read_inode_bitmap(sb, i); if (!bitmap_bh) continue; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9843b04..7e91913e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -41,6 +41,8 @@ #include "acl.h" #include "ext4_extents.h" +#define MPAGE_DA_EXTENT_TAIL 0x01 + static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { @@ -191,6 +193,7 @@ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) void ext4_delete_inode (struct inode * inode) { handle_t *handle; + int err; if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); @@ -199,8 +202,9 @@ void ext4_delete_inode (struct inode * inode) if (is_bad_inode(inode)) goto no_delete; - handle = start_transaction(inode); + handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); if (IS_ERR(handle)) { + ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* * If we're going to skip the normal cleanup, we still need to * make sure that the in-core orphan linked list is properly @@ -213,8 +217,34 @@ void ext4_delete_inode (struct inode * inode) if (IS_SYNC(inode)) handle->h_sync = 1; inode->i_size = 0; + err = ext4_mark_inode_dirty(handle, inode); + if (err) { + ext4_warning(inode->i_sb, __func__, + "couldn't mark inode dirty (err %d)", err); + goto stop_handle; + } if (inode->i_blocks) ext4_truncate(inode); + + /* + * ext4_ext_truncate() doesn't reserve any slop when it + * restarts journal transactions; therefore there may not be + * enough credits left in the handle to remove the inode from + * the orphan list and set the dtime field. + */ + if (handle->h_buffer_credits < 3) { + err = ext4_journal_extend(handle, 3); + if (err > 0) + err = ext4_journal_restart(handle, 3); + if (err != 0) { + ext4_warning(inode->i_sb, __func__, + "couldn't extend journal (err %d)", err); + stop_handle: + ext4_journal_stop(handle); + goto no_delete; + } + } + /* * Kill off the orphan record which ext4_truncate created. * AKPM: I think this can be inside the above `if'. @@ -952,23 +982,74 @@ out: return err; } -/* Maximum number of blocks we map for direct IO at once. */ -#define DIO_MAX_BLOCKS 4096 /* - * Number of credits we need for writing DIO_MAX_BLOCKS: - * We need sb + group descriptor + bitmap + inode -> 4 - * For B blocks with A block pointers per block we need: - * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect). - * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25. + * Calculate the number of metadata blocks need to reserve + * to allocate @blocks for non extent file based file */ -#define DIO_CREDITS 25 +static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) +{ + int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ind_blks, dind_blks, tind_blks; + + /* number of new indirect blocks needed */ + ind_blks = (blocks + icap - 1) / icap; + + dind_blks = (ind_blks + icap - 1) / icap; + tind_blks = 1; + + return ind_blks + dind_blks + tind_blks; +} /* + * Calculate the number of metadata blocks need to reserve + * to allocate given number of blocks + */ +static int ext4_calc_metadata_amount(struct inode *inode, int blocks) +{ + if (!blocks) + return 0; + + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) + return ext4_ext_calc_metadata_amount(inode, blocks); + + return ext4_indirect_calc_metadata_amount(inode, blocks); +} + +static void ext4_da_update_reserve_space(struct inode *inode, int used) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int total, mdb, mdb_free; + + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + /* recalculate the number of metablocks still need to be reserved */ + total = EXT4_I(inode)->i_reserved_data_blocks - used; + mdb = ext4_calc_metadata_amount(inode, total); + + /* figure out how many metablocks to release */ + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; + + /* Account for allocated meta_blocks */ + mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; + + /* update fs free blocks counter for truncate case */ + percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free); + + /* update per-inode reservations */ + BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); + EXT4_I(inode)->i_reserved_data_blocks -= used; + + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + EXT4_I(inode)->i_reserved_meta_blocks = mdb; + EXT4_I(inode)->i_allocated_meta_blocks = 0; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); +} + +/* + * The ext4_get_blocks_wrap() function try to look up the requested blocks, + * and returns if the blocks are already mapped. * - * - * ext4_ext4 get_block() wrapper function - * It will do a look up first, and returns if the blocks already mapped. * Otherwise it takes the write lock of the i_data_sem and allocate blocks * and store the allocated blocks in the result buffer head and mark it * mapped. @@ -1069,26 +1150,30 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, * which were deferred till now */ if ((retval > 0) && buffer_delay(bh)) - ext4_da_release_space(inode, retval, 0); + ext4_da_update_reserve_space(inode, retval); } up_write((&EXT4_I(inode)->i_data_sem)); return retval; } +/* Maximum number of blocks we map for direct IO at once. */ +#define DIO_MAX_BLOCKS 4096 + static int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { handle_t *handle = ext4_journal_current_handle(); int ret = 0, started = 0; unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + int dio_credits; if (create && !handle) { /* Direct IO write... */ if (max_blocks > DIO_MAX_BLOCKS) max_blocks = DIO_MAX_BLOCKS; - handle = ext4_journal_start(inode, DIO_CREDITS + - 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)); + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); + handle = ext4_journal_start(inode, dio_credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -1336,12 +1421,8 @@ static int ext4_ordered_write_end(struct file *file, { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; - unsigned from, to; int ret = 0, ret2; - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; - ret = ext4_jbd2_file_inode(handle, inode); if (ret == 0) { @@ -1437,36 +1518,6 @@ static int ext4_journalled_write_end(struct file *file, return ret ? ret : copied; } -/* - * Calculate the number of metadata blocks need to reserve - * to allocate @blocks for non extent file based file - */ -static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) -{ - int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); - int ind_blks, dind_blks, tind_blks; - - /* number of new indirect blocks needed */ - ind_blks = (blocks + icap - 1) / icap; - - dind_blks = (ind_blks + icap - 1) / icap; - - tind_blks = 1; - - return ind_blks + dind_blks + tind_blks; -} - -/* - * Calculate the number of metadata blocks need to reserve - * to allocate given number of blocks - */ -static int ext4_calc_metadata_amount(struct inode *inode, int blocks) -{ - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) - return ext4_ext_calc_metadata_amount(inode, blocks); - - return ext4_indirect_calc_metadata_amount(inode, blocks); -} static int ext4_da_reserve_space(struct inode *inode, int nrblocks) { @@ -1490,7 +1541,6 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks) spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); return -ENOSPC; } - /* reduce fs free blocks counter */ percpu_counter_sub(&sbi->s_freeblocks_counter, total); @@ -1501,35 +1551,49 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks) return 0; /* success */ } -void ext4_da_release_space(struct inode *inode, int used, int to_free) +static void ext4_da_release_space(struct inode *inode, int to_free) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int total, mdb, mdb_free, release; + if (!to_free) + return; /* Nothing to release, exit */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + + if (!EXT4_I(inode)->i_reserved_data_blocks) { + /* + * if there is no reserved blocks, but we try to free some + * then the counter is messed up somewhere. + * but since this function is called from invalidate + * page, it's harmless to return without any action + */ + printk(KERN_INFO "ext4 delalloc try to release %d reserved " + "blocks for inode %lu, but there is no reserved " + "data blocks\n", to_free, inode->i_ino); + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return; + } + /* recalculate the number of metablocks still need to be reserved */ - total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free; + total = EXT4_I(inode)->i_reserved_data_blocks - to_free; mdb = ext4_calc_metadata_amount(inode, total); /* figure out how many metablocks to release */ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; - /* Account for allocated meta_blocks */ - mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; - release = to_free + mdb_free; /* update fs free blocks counter for truncate case */ percpu_counter_add(&sbi->s_freeblocks_counter, release); /* update per-inode reservations */ - BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks); - EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free); + BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); + EXT4_I(inode)->i_reserved_data_blocks -= to_free; BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); EXT4_I(inode)->i_reserved_meta_blocks = mdb; - EXT4_I(inode)->i_allocated_meta_blocks = 0; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); } @@ -1551,7 +1615,7 @@ static void ext4_da_page_release_reservation(struct page *page, } curr_off = next_off; } while ((bh = bh->b_this_page) != head); - ext4_da_release_space(page->mapping->host, 0, to_release); + ext4_da_release_space(page->mapping->host, to_release); } /* @@ -1564,11 +1628,13 @@ struct mpage_da_data { unsigned long first_page, next_page; /* extent of pages */ get_block_t *get_block; struct writeback_control *wbc; + int io_done; + long pages_written; }; /* * mpage_da_submit_io - walks through extent of pages and try to write - * them with __mpage_writepage() + * them with writepage() call back * * @mpd->inode: inode * @mpd->first_page: first page of the extent @@ -1583,18 +1649,11 @@ struct mpage_da_data { static int mpage_da_submit_io(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; - struct mpage_data mpd_pp = { - .bio = NULL, - .last_block_in_bio = 0, - .get_block = mpd->get_block, - .use_writepage = 1, - }; int ret = 0, err, nr_pages, i; unsigned long index, end; struct pagevec pvec; BUG_ON(mpd->next_page <= mpd->first_page); - pagevec_init(&pvec, 0); index = mpd->first_page; end = mpd->next_page - 1; @@ -1612,8 +1671,9 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) break; index++; - err = __mpage_writepage(page, mpd->wbc, &mpd_pp); - + err = mapping->a_ops->writepage(page, mpd->wbc); + if (!err) + mpd->pages_written++; /* * In error case, we have to continue because * remaining pages are still locked @@ -1624,9 +1684,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) } pagevec_release(&pvec); } - if (mpd_pp.bio) - mpage_bio_submit(WRITE, mpd_pp.bio); - return ret; } @@ -1649,7 +1706,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, int blocks = exbh->b_size >> inode->i_blkbits; sector_t pblock = exbh->b_blocknr, cur_logical; struct buffer_head *head, *bh; - unsigned long index, end; + pgoff_t index, end; struct pagevec pvec; int nr_pages, i; @@ -1692,6 +1749,13 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, if (buffer_delay(bh)) { bh->b_blocknr = pblock; clear_buffer_delay(bh); + bh->b_bdev = inode->i_sb->s_bdev; + } else if (buffer_unwritten(bh)) { + bh->b_blocknr = pblock; + clear_buffer_unwritten(bh); + set_buffer_mapped(bh); + set_buffer_new(bh); + bh->b_bdev = inode->i_sb->s_bdev; } else if (buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); @@ -1727,13 +1791,11 @@ static inline void __unmap_underlying_blocks(struct inode *inode, * * The function skips space we know is already mapped to disk blocks. * - * The function ignores errors ->get_block() returns, thus real - * error handling is postponed to __mpage_writepage() */ static void mpage_da_map_blocks(struct mpage_da_data *mpd) { + int err = 0; struct buffer_head *lbh = &mpd->lbh; - int err = 0, remain = lbh->b_size; sector_t next = lbh->b_blocknr; struct buffer_head new; @@ -1743,38 +1805,36 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd) if (buffer_mapped(lbh) && !buffer_delay(lbh)) return; - while (remain) { - new.b_state = lbh->b_state; - new.b_blocknr = 0; - new.b_size = remain; - err = mpd->get_block(mpd->inode, next, &new, 1); - if (err) { - /* - * Rather than implement own error handling - * here, we just leave remaining blocks - * unallocated and try again with ->writepage() - */ - break; - } - BUG_ON(new.b_size == 0); + new.b_state = lbh->b_state; + new.b_blocknr = 0; + new.b_size = lbh->b_size; - if (buffer_new(&new)) - __unmap_underlying_blocks(mpd->inode, &new); + /* + * If we didn't accumulate anything + * to write simply return + */ + if (!new.b_size) + return; + err = mpd->get_block(mpd->inode, next, &new, 1); + if (err) + return; + BUG_ON(new.b_size == 0); - /* - * If blocks are delayed marked, we need to - * put actual blocknr and drop delayed bit - */ - if (buffer_delay(lbh)) - mpage_put_bnr_to_bhs(mpd, next, &new); + if (buffer_new(&new)) + __unmap_underlying_blocks(mpd->inode, &new); - /* go for the remaining blocks */ - next += new.b_size >> mpd->inode->i_blkbits; - remain -= new.b_size; - } + /* + * If blocks are delayed marked, we need to + * put actual blocknr and drop delayed bit + */ + if (buffer_delay(lbh) || buffer_unwritten(lbh)) + mpage_put_bnr_to_bhs(mpd, next, &new); + + return; } -#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ + (1 << BH_Delay) | (1 << BH_Unwritten)) /* * mpage_add_bh_to_extent - try to add one more block to extent of blocks @@ -1788,41 +1848,61 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd) static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical, struct buffer_head *bh) { - struct buffer_head *lbh = &mpd->lbh; sector_t next; + size_t b_size = bh->b_size; + struct buffer_head *lbh = &mpd->lbh; + int nrblocks = lbh->b_size >> mpd->inode->i_blkbits; - next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits); - + /* check if thereserved journal credits might overflow */ + if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { + if (nrblocks >= EXT4_MAX_TRANS_DATA) { + /* + * With non-extent format we are limited by the journal + * credit available. Total credit needed to insert + * nrblocks contiguous blocks is dependent on the + * nrblocks. So limit nrblocks. + */ + goto flush_it; + } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > + EXT4_MAX_TRANS_DATA) { + /* + * Adding the new buffer_head would make it cross the + * allowed limit for which we have journal credit + * reserved. So limit the new bh->b_size + */ + b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << + mpd->inode->i_blkbits; + /* we will do mpage_da_submit_io in the next loop */ + } + } /* * First block in the extent */ if (lbh->b_size == 0) { lbh->b_blocknr = logical; - lbh->b_size = bh->b_size; + lbh->b_size = b_size; lbh->b_state = bh->b_state & BH_FLAGS; return; } + next = lbh->b_blocknr + nrblocks; /* * Can we merge the block to our big extent? */ if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { - lbh->b_size += bh->b_size; + lbh->b_size += b_size; return; } +flush_it: /* * We couldn't merge the block to our extent, so we * need to flush current extent and start new one */ mpage_da_map_blocks(mpd); - - /* - * Now start a new extent - */ - lbh->b_size = bh->b_size; - lbh->b_state = bh->b_state & BH_FLAGS; - lbh->b_blocknr = logical; + mpage_da_submit_io(mpd); + mpd->io_done = 1; + return; } /* @@ -1842,17 +1922,35 @@ static int __mpage_da_writepage(struct page *page, struct buffer_head *bh, *head, fake; sector_t logical; + if (mpd->io_done) { + /* + * Rest of the page in the page_vec + * redirty then and skip then. We will + * try to to write them again after + * starting a new transaction + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return MPAGE_DA_EXTENT_TAIL; + } /* * Can we merge this page to current extent? */ if (mpd->next_page != page->index) { /* * Nope, we can't. So, we map non-allocated blocks - * and start IO on them using __mpage_writepage() + * and start IO on them using writepage() */ if (mpd->next_page != mpd->first_page) { mpage_da_map_blocks(mpd); mpage_da_submit_io(mpd); + /* + * skip rest of the page in the page_vec + */ + mpd->io_done = 1; + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return MPAGE_DA_EXTENT_TAIL; } /* @@ -1883,6 +1981,8 @@ static int __mpage_da_writepage(struct page *page, set_buffer_dirty(bh); set_buffer_uptodate(bh); mpage_add_bh_to_extent(mpd, logical, bh); + if (mpd->io_done) + return MPAGE_DA_EXTENT_TAIL; } else { /* * Page with regular buffer heads, just add all dirty ones @@ -1891,8 +1991,12 @@ static int __mpage_da_writepage(struct page *page, bh = head; do { BUG_ON(buffer_locked(bh)); - if (buffer_dirty(bh)) + if (buffer_dirty(bh) && + (!buffer_mapped(bh) || buffer_delay(bh))) { mpage_add_bh_to_extent(mpd, logical, bh); + if (mpd->io_done) + return MPAGE_DA_EXTENT_TAIL; + } logical++; } while ((bh = bh->b_this_page) != head); } @@ -1911,22 +2015,13 @@ static int __mpage_da_writepage(struct page *page, * * This is a library function, which implements the writepages() * address_space_operation. - * - * In order to avoid duplication of logic that deals with partial pages, - * multiple bio per page, etc, we find non-allocated blocks, allocate - * them with minimal calls to ->get_block() and re-use __mpage_writepage() - * - * It's important that we call __mpage_writepage() only once for each - * involved page, otherwise we'd have to implement more complicated logic - * to deal with pages w/o PG_lock or w/ PG_writeback and so on. - * - * See comments to mpage_writepages() */ static int mpage_da_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block) { struct mpage_da_data mpd; + long to_write; int ret; if (!get_block) @@ -1940,17 +2035,22 @@ static int mpage_da_writepages(struct address_space *mapping, mpd.first_page = 0; mpd.next_page = 0; mpd.get_block = get_block; + mpd.io_done = 0; + mpd.pages_written = 0; + + to_write = wbc->nr_to_write; ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); /* * Handle last extent of pages */ - if (mpd.next_page != mpd.first_page) { + if (!mpd.io_done && mpd.next_page != mpd.first_page) { mpage_da_map_blocks(&mpd); mpage_da_submit_io(&mpd); } + wbc->nr_to_write = to_write - mpd.pages_written; return ret; } @@ -2155,63 +2255,95 @@ static int ext4_da_writepage(struct page *page, } /* - * For now just follow the DIO way to estimate the max credits - * needed to write out EXT4_MAX_WRITEBACK_PAGES. - * todo: need to calculate the max credits need for - * extent based files, currently the DIO credits is based on - * indirect-blocks mapping way. - * - * Probably should have a generic way to calculate credits - * for DIO, writepages, and truncate + * This is called via ext4_da_writepages() to + * calulate the total number of credits to reserve to fit + * a single extent allocation into a single transaction, + * ext4_da_writpeages() will loop calling this before + * the block allocation. */ -#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS -#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS + +static int ext4_da_writepages_trans_blocks(struct inode *inode) +{ + int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; + + /* + * With non-extent format the journal credit needed to + * insert nrblocks contiguous block is dependent on + * number of contiguous block. So we will limit + * number of contiguous block to a sane value + */ + if (!(inode->i_flags & EXT4_EXTENTS_FL) && + (max_blocks > EXT4_MAX_TRANS_DATA)) + max_blocks = EXT4_MAX_TRANS_DATA; + + return ext4_chunk_trans_blocks(inode, max_blocks); +} static int ext4_da_writepages(struct address_space *mapping, - struct writeback_control *wbc) + struct writeback_control *wbc) { - struct inode *inode = mapping->host; handle_t *handle = NULL; - int needed_blocks; - int ret = 0; - long to_write; loff_t range_start = 0; + struct inode *inode = mapping->host; + int needed_blocks, ret = 0, nr_to_writebump = 0; + long to_write, pages_skipped = 0; + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); /* * No pages to write? This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() * because that could violate lock ordering on umount */ - if (!mapping->nrpages) + if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; - /* - * Estimate the worse case needed credits to write out - * EXT4_MAX_BUF_BLOCKS pages + * Make sure nr_to_write is >= sbi->s_mb_stream_request + * This make sure small files blocks are allocated in + * single attempt. This ensure that small files + * get less fragmented. */ - needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; + if (wbc->nr_to_write < sbi->s_mb_stream_request) { + nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; + wbc->nr_to_write = sbi->s_mb_stream_request; + } - to_write = wbc->nr_to_write; - if (!wbc->range_cyclic) { + if (!wbc->range_cyclic) /* * If range_cyclic is not set force range_cont * and save the old writeback_index */ wbc->range_cont = 1; - range_start = wbc->range_start; - } - while (!ret && to_write) { + range_start = wbc->range_start; + pages_skipped = wbc->pages_skipped; + +restart_loop: + to_write = wbc->nr_to_write; + while (!ret && to_write > 0) { + + /* + * we insert one extent at a time. So we need + * credit needed for single extent allocation. + * journalled mode is currently not supported + * by delalloc + */ + BUG_ON(ext4_should_journal_data(inode)); + needed_blocks = ext4_da_writepages_trans_blocks(inode); + /* start a new transaction*/ handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); + printk(KERN_EMERG "%s: jbd2_start: " + "%ld pages, ino %lu; err %d\n", __func__, + wbc->nr_to_write, inode->i_ino, ret); + dump_stack(); goto out_writepages; } if (ext4_should_order_data(inode)) { /* * With ordered mode we need to add - * the inode to the journal handle + * the inode to the journal handl * when we do block allocation. */ ret = ext4_jbd2_file_inode(handle, inode); @@ -2219,20 +2351,20 @@ static int ext4_da_writepages(struct address_space *mapping, ext4_journal_stop(handle); goto out_writepages; } - } - /* - * set the max dirty pages could be write at a time - * to fit into the reserved transaction credits - */ - if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES) - wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES; to_write -= wbc->nr_to_write; ret = mpage_da_writepages(mapping, wbc, - ext4_da_get_block_write); + ext4_da_get_block_write); ext4_journal_stop(handle); - if (wbc->nr_to_write) { + if (ret == MPAGE_DA_EXTENT_TAIL) { + /* + * got one extent now try with + * rest of the pages + */ + to_write += wbc->nr_to_write; + ret = 0; + } else if (wbc->nr_to_write) { /* * There is no more writeout needed * or we requested for a noblocking writeout @@ -2244,10 +2376,18 @@ static int ext4_da_writepages(struct address_space *mapping, wbc->nr_to_write = to_write; } -out_writepages: - wbc->nr_to_write = to_write; - if (range_start) + if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) { + /* We skipped pages in this loop */ wbc->range_start = range_start; + wbc->nr_to_write = to_write + + wbc->pages_skipped - pages_skipped; + wbc->pages_skipped = pages_skipped; + goto restart_loop; + } + +out_writepages: + wbc->nr_to_write = to_write - nr_to_writebump; + wbc->range_start = range_start; return ret; } @@ -2280,8 +2420,11 @@ retry: } page = __grab_cache_page(mapping, index); - if (!page) - return -ENOMEM; + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; + } *pagep = page; ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, @@ -3434,6 +3577,9 @@ void ext4_truncate(struct inode *inode) * modify the block allocation tree. */ down_write(&ei->i_data_sem); + + ext4_discard_reservation(inode); + /* * The orphan list entry will now protect us from any crash which * occurs before the truncate completes, so it is now safe to propagate @@ -3503,8 +3649,6 @@ do_indirects: ; } - ext4_discard_reservation(inode); - up_write(&ei->i_data_sem); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); @@ -3590,6 +3734,16 @@ static int __ext4_get_inode_loc(struct inode *inode, } if (!buffer_uptodate(bh)) { lock_buffer(bh); + + /* + * If the buffer has the write error flag, we have failed + * to write out another inode in the same block. In this + * case, we don't have to read the block because we may + * read the old inode data successfully. + */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) + set_buffer_uptodate(bh); + if (buffer_uptodate(bh)) { /* someone brought it uptodate while we waited */ unlock_buffer(bh); @@ -4262,57 +4416,129 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } +static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, + int chunk) +{ + int indirects; + + /* if nrblocks are contiguous */ + if (chunk) { + /* + * With N contiguous data blocks, it need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks + * 2 dindirect blocks + * 1 tindirect block + */ + indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); + return indirects + 3; + } + /* + * if nrblocks are not contiguous, worse case, each block touch + * a indirect block, and each indirect block touch a double indirect + * block, plus a triple indirect block + */ + indirects = nrblocks * 2 + 1; + return indirects; +} + +static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return ext4_indirect_trans_blocks(inode, nrblocks, 0); + return ext4_ext_index_trans_blocks(inode, nrblocks, 0); +} /* - * How many blocks doth make a writepage()? + * Account for index blocks, block groups bitmaps and block group + * descriptor blocks if modify datablocks and index blocks + * worse case, the indexs blocks spread over different block groups * - * With N blocks per page, it may be: - * N data blocks - * 2 indirect block - * 2 dindirect - * 1 tindirect - * N+5 bitmap blocks (from the above) - * N+5 group descriptor summary blocks - * 1 inode block - * 1 superblock. - * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files + * If datablocks are discontiguous, they are possible to spread over + * different block groups too. If they are contiugous, with flexbg, + * they could still across block group boundary. * - * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS - * - * With ordered or writeback data it's the same, less the N data blocks. + * Also account for superblock, inode, quota and xattr blocks + */ +int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int groups, gdpblocks; + int idxblocks; + int ret = 0; + + /* + * How many index blocks need to touch to modify nrblocks? + * The "Chunk" flag indicating whether the nrblocks is + * physically contiguous on disk + * + * For Direct IO and fallocate, they calls get_block to allocate + * one single extent at a time, so they could set the "Chunk" flag + */ + idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); + + ret = idxblocks; + + /* + * Now let's see how many group bitmaps and group descriptors need + * to account + */ + groups = idxblocks; + if (chunk) + groups += 1; + else + groups += nrblocks; + + gdpblocks = groups; + if (groups > EXT4_SB(inode->i_sb)->s_groups_count) + groups = EXT4_SB(inode->i_sb)->s_groups_count; + if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) + gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; + + /* bitmaps and block group descriptor blocks */ + ret += groups + gdpblocks; + + /* Blocks for super block, inode, quota and xattr blocks */ + ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); + + return ret; +} + +/* + * Calulate the total number of credits to reserve to fit + * the modification of a single pages into a single transaction, + * which may include multiple chunks of block allocations. * - * If the inode's direct blocks can hold an integral number of pages then a - * page cannot straddle two indirect blocks, and we can only touch one indirect - * and dindirect block, and the "5" above becomes "3". + * This could be called via ext4_write_begin() * - * This still overestimates under most circumstances. If we were to pass the - * start and end offsets in here as well we could do block_to_path() on each - * block and work out the exact number of indirects which are touched. Pah. + * We need to consider the worse case, when + * one new block per extent. */ - int ext4_writepage_trans_blocks(struct inode *inode) { int bpp = ext4_journal_blocks_per_page(inode); - int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3; int ret; - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) - return ext4_ext_writepage_trans_blocks(inode, bpp); + ret = ext4_meta_trans_blocks(inode, bpp, 0); + /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) - ret = 3 * (bpp + indirects) + 2; - else - ret = 2 * (bpp + indirects) + 2; - -#ifdef CONFIG_QUOTA - /* We know that structure was already allocated during DQUOT_INIT so - * we will be updating only the data blocks + inodes */ - ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif - + ret += bpp; return ret; } /* + * Calculate the journal credits for a chunk of data modification. + * + * This is called from DIO, fallocate or whoever calling + * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks. + * + * journal buffers for data blocks are not included here, as DIO + * and fallocate do no need to journal data buffers. + */ +int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) +{ + return ext4_meta_trans_blocks(inode, nrblocks, 1); +} + +/* * The caller must have previously called ext4_reserve_inode_write(). * Give this, we know that the caller already has write access to iloc->bh. */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8d141a2..e0e3a5e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -787,13 +787,16 @@ static int ext4_mb_init_cache(struct page *page, char *incore) if (bh_uptodate_or_lock(bh[i])) continue; + spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh[i], first_group + i, desc); set_buffer_uptodate(bh[i]); unlock_buffer(bh[i]); + spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); continue; } + spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); get_bh(bh[i]); bh[i]->b_end_io = end_buffer_read_sync; submit_bh(READ, bh[i]); @@ -2477,7 +2480,7 @@ err_freesgi: int ext4_mb_init(struct super_block *sb, int needs_recovery) { struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned i; + unsigned i, j; unsigned offset; unsigned max; int ret; @@ -2537,7 +2540,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; - i = sizeof(struct ext4_locality_group) * NR_CPUS; + i = sizeof(struct ext4_locality_group) * nr_cpu_ids; sbi->s_locality_groups = kmalloc(i, GFP_KERNEL); if (sbi->s_locality_groups == NULL) { clear_opt(sbi->s_mount_opt, MBALLOC); @@ -2545,11 +2548,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) kfree(sbi->s_mb_maxs); return -ENOMEM; } - for (i = 0; i < NR_CPUS; i++) { + for (i = 0; i < nr_cpu_ids; i++) { struct ext4_locality_group *lg; lg = &sbi->s_locality_groups[i]; mutex_init(&lg->lg_mutex); - INIT_LIST_HEAD(&lg->lg_prealloc_list); + for (j = 0; j < PREALLOC_TB_SIZE; j++) + INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); spin_lock_init(&lg->lg_prealloc_lock); } @@ -3260,6 +3264,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { unsigned int len = ac->ac_o_ex.fe_len; + ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); @@ -3277,14 +3282,45 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, } /* + * Return the prealloc space that have minimal distance + * from the goal block. @cpa is the prealloc + * space that is having currently known minimal distance + * from the goal block. + */ +static struct ext4_prealloc_space * +ext4_mb_check_group_pa(ext4_fsblk_t goal_block, + struct ext4_prealloc_space *pa, + struct ext4_prealloc_space *cpa) +{ + ext4_fsblk_t cur_distance, new_distance; + + if (cpa == NULL) { + atomic_inc(&pa->pa_count); + return pa; + } + cur_distance = abs(goal_block - cpa->pa_pstart); + new_distance = abs(goal_block - pa->pa_pstart); + + if (cur_distance < new_distance) + return cpa; + + /* drop the previous reference */ + atomic_dec(&cpa->pa_count); + atomic_inc(&pa->pa_count); + return pa; +} + +/* * search goal blocks in preallocated space */ static noinline_for_stack int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { + int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; - struct ext4_prealloc_space *pa; + struct ext4_prealloc_space *pa, *cpa = NULL; + ext4_fsblk_t goal_block; /* only data can be preallocated */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) @@ -3322,22 +3358,38 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) lg = ac->ac_lg; if (lg == NULL) return 0; + order = fls(ac->ac_o_ex.fe_len) - 1; + if (order > PREALLOC_TB_SIZE - 1) + /* The max size of hash table is PREALLOC_TB_SIZE */ + order = PREALLOC_TB_SIZE - 1; + + goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + + ac->ac_g_ex.fe_start + + le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block); + /* + * search for the prealloc space that is having + * minimal distance from the goal block. + */ + for (i = order; i < PREALLOC_TB_SIZE; i++) { + rcu_read_lock(); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], + pa_inode_list) { + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 0 && + pa->pa_free >= ac->ac_o_ex.fe_len) { - rcu_read_lock(); - list_for_each_entry_rcu(pa, &lg->lg_prealloc_list, pa_inode_list) { - spin_lock(&pa->pa_lock); - if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) { - atomic_inc(&pa->pa_count); - ext4_mb_use_group_pa(ac, pa); + cpa = ext4_mb_check_group_pa(goal_block, + pa, cpa); + } spin_unlock(&pa->pa_lock); - ac->ac_criteria = 20; - rcu_read_unlock(); - return 1; } - spin_unlock(&pa->pa_lock); + rcu_read_unlock(); + } + if (cpa) { + ext4_mb_use_group_pa(ac, cpa); + ac->ac_criteria = 20; + return 1; } - rcu_read_unlock(); - return 0; } @@ -3560,6 +3612,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_free = pa->pa_len; atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); pa->pa_deleted = 0; pa->pa_linear = 1; @@ -3580,10 +3633,10 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ext4_unlock_group(sb, ac->ac_b_ex.fe_group); - spin_lock(pa->pa_obj_lock); - list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list); - spin_unlock(pa->pa_obj_lock); - + /* + * We will later add the new pa to the right bucket + * after updating the pa_free in ext4_mb_release_context + */ return 0; } @@ -3733,20 +3786,23 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { - /* error handling here */ - ext4_mb_release_desc(&e4b); - BUG_ON(bitmap_bh == NULL); + ext4_error(sb, __func__, "Error in reading block " + "bitmap for %lu\n", group); + return 0; } err = ext4_mb_load_buddy(sb, group, &e4b); - BUG_ON(err != 0); /* error handling here */ + if (err) { + ext4_error(sb, __func__, "Error in loading buddy " + "information for %lu\n", group); + put_bh(bitmap_bh); + return 0; + } if (needed == 0) needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; - grp = ext4_get_group_info(sb, group); INIT_LIST_HEAD(&list); - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); repeat: ext4_lock_group(sb, group); @@ -3903,13 +3959,18 @@ repeat: ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); err = ext4_mb_load_buddy(sb, group, &e4b); - BUG_ON(err != 0); /* error handling here */ + if (err) { + ext4_error(sb, __func__, "Error in loading buddy " + "information for %lu\n", group); + continue; + } bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { - /* error handling here */ + ext4_error(sb, __func__, "Error in reading block " + "bitmap for %lu\n", group); ext4_mb_release_desc(&e4b); - BUG_ON(bitmap_bh == NULL); + continue; } ext4_lock_group(sb, group); @@ -4112,22 +4173,168 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, } +static noinline_for_stack void +ext4_mb_discard_lg_preallocations(struct super_block *sb, + struct ext4_locality_group *lg, + int order, int total_entries) +{ + ext4_group_t group = 0; + struct ext4_buddy e4b; + struct list_head discard_list; + struct ext4_prealloc_space *pa, *tmp; + struct ext4_allocation_context *ac; + + mb_debug("discard locality group preallocation\n"); + + INIT_LIST_HEAD(&discard_list); + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); + + spin_lock(&lg->lg_prealloc_lock); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], + pa_inode_list) { + spin_lock(&pa->pa_lock); + if (atomic_read(&pa->pa_count)) { + /* + * This is the pa that we just used + * for block allocation. So don't + * free that + */ + spin_unlock(&pa->pa_lock); + continue; + } + if (pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + /* only lg prealloc space */ + BUG_ON(!pa->pa_linear); + + /* seems this one can be freed ... */ + pa->pa_deleted = 1; + spin_unlock(&pa->pa_lock); + + list_del_rcu(&pa->pa_inode_list); + list_add(&pa->u.pa_tmp_list, &discard_list); + + total_entries--; + if (total_entries <= 5) { + /* + * we want to keep only 5 entries + * allowing it to grow to 8. This + * mak sure we don't call discard + * soon for this list. + */ + break; + } + } + spin_unlock(&lg->lg_prealloc_lock); + + list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { + + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); + if (ext4_mb_load_buddy(sb, group, &e4b)) { + ext4_error(sb, __func__, "Error in loading buddy " + "information for %lu\n", group); + continue; + } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_mb_release_group_pa(&e4b, pa, ac); + ext4_unlock_group(sb, group); + + ext4_mb_release_desc(&e4b); + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); +} + +/* + * We have incremented pa_count. So it cannot be freed at this + * point. Also we hold lg_mutex. So no parallel allocation is + * possible from this lg. That means pa_free cannot be updated. + * + * A parallel ext4_mb_discard_group_preallocations is possible. + * which can cause the lg_prealloc_list to be updated. + */ + +static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) +{ + int order, added = 0, lg_prealloc_count = 1; + struct super_block *sb = ac->ac_sb; + struct ext4_locality_group *lg = ac->ac_lg; + struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; + + order = fls(pa->pa_free) - 1; + if (order > PREALLOC_TB_SIZE - 1) + /* The max size of hash table is PREALLOC_TB_SIZE */ + order = PREALLOC_TB_SIZE - 1; + /* Add the prealloc space to lg */ + rcu_read_lock(); + list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], + pa_inode_list) { + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted) { + spin_unlock(&pa->pa_lock); + continue; + } + if (!added && pa->pa_free < tmp_pa->pa_free) { + /* Add to the tail of the previous entry */ + list_add_tail_rcu(&pa->pa_inode_list, + &tmp_pa->pa_inode_list); + added = 1; + /* + * we want to count the total + * number of entries in the list + */ + } + spin_unlock(&tmp_pa->pa_lock); + lg_prealloc_count++; + } + if (!added) + list_add_tail_rcu(&pa->pa_inode_list, + &lg->lg_prealloc_list[order]); + rcu_read_unlock(); + + /* Now trim the list to be not more than 8 elements */ + if (lg_prealloc_count > 8) { + ext4_mb_discard_lg_preallocations(sb, lg, + order, lg_prealloc_count); + return; + } + return ; +} + /* * release all resource we used in allocation */ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { - if (ac->ac_pa) { - if (ac->ac_pa->pa_linear) { + struct ext4_prealloc_space *pa = ac->ac_pa; + if (pa) { + if (pa->pa_linear) { /* see comment in ext4_mb_use_group_pa() */ - spin_lock(&ac->ac_pa->pa_lock); - ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len; - ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len; - ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len; - ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len; - spin_unlock(&ac->ac_pa->pa_lock); + spin_lock(&pa->pa_lock); + pa->pa_pstart += ac->ac_b_ex.fe_len; + pa->pa_lstart += ac->ac_b_ex.fe_len; + pa->pa_free -= ac->ac_b_ex.fe_len; + pa->pa_len -= ac->ac_b_ex.fe_len; + spin_unlock(&pa->pa_lock); + /* + * We want to add the pa to the right bucket. + * Remove it from the list and while adding + * make sure the list to which we are adding + * doesn't grow big. + */ + if (likely(pa->pa_free)) { + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + ext4_mb_add_n_trim(ac); + } } - ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa); + ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->ac_bitmap_page) page_cache_release(ac->ac_bitmap_page); @@ -4420,11 +4627,15 @@ do_more: count -= overflow; } bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (!bitmap_bh) + if (!bitmap_bh) { + err = -EIO; goto error_return; + } gdp = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!gdp) + if (!gdp) { + err = -EIO; goto error_return; + } if (in_range(ext4_block_bitmap(sb, gdp), block, count) || in_range(ext4_inode_bitmap(sb, gdp), block, count) || diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index bfe6add..c7c9906 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -164,11 +164,17 @@ struct ext4_free_extent { * Locality group: * we try to group all related changes together * so that writeback can flush/allocate them together as well + * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC + * (512). We store prealloc space into the hash based on the pa_free blocks + * order value.ie, fls(pa_free)-1; */ +#define PREALLOC_TB_SIZE 10 struct ext4_locality_group { /* for allocator */ - struct mutex lg_mutex; /* to serialize allocates */ - struct list_head lg_prealloc_list;/* list of preallocations */ + /* to serialize allocates */ + struct mutex lg_mutex; + /* list of preallocations */ + struct list_head lg_prealloc_list[PREALLOC_TB_SIZE]; spinlock_t lg_prealloc_lock; }; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b9e077b..46fc0b5 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -53,7 +53,8 @@ static int finish_range(handle_t *handle, struct inode *inode, * credit. But below we try to not accumalate too much * of them by restarting the journal. */ - needed = ext4_ext_calc_credits_for_insert(inode, path); + needed = ext4_ext_calc_credits_for_single_extent(inode, + lb->last_block - lb->first_block + 1, path); /* * Make sure the credit we accumalated is not really high diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index f000fbe..b3d3560 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -73,7 +73,7 @@ static int verify_group_input(struct super_block *sb, "Inode bitmap not in group (block %llu)", (unsigned long long)input->inode_bitmap); else if (outside(input->inode_table, start, end) || - outside(itend - 1, start, end)) + outside(itend - 1, start, end)) ext4_warning(sb, __func__, "Inode table not in group (blocks %llu-%llu)", (unsigned long long)input->inode_table, itend - 1); @@ -104,7 +104,7 @@ static int verify_group_input(struct super_block *sb, (unsigned long long)input->inode_bitmap, start, metaend - 1); else if (inside(input->inode_table, start, metaend) || - inside(itend - 1, start, metaend)) + inside(itend - 1, start, metaend)) ext4_warning(sb, __func__, "Inode table (%llu-%llu) overlaps" "GDT table (%llu-%llu)", @@ -158,9 +158,9 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh, if (err) { if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) return err; - if ((err = ext4_journal_get_write_access(handle, bh))) + if ((err = ext4_journal_get_write_access(handle, bh))) return err; - } + } return 0; } @@ -416,11 +416,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", gdb_num); - /* - * If we are not using the primary superblock/GDT copy don't resize, - * because the user tools have no way of handling this. Probably a - * bad time to do it anyways. - */ + /* + * If we are not using the primary superblock/GDT copy don't resize, + * because the user tools have no way of handling this. Probably a + * bad time to do it anyways. + */ if (EXT4_SB(sb)->s_sbh->b_blocknr != le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { ext4_warning(sb, __func__, @@ -507,14 +507,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, return 0; exit_inode: - //ext4_journal_release_buffer(handle, iloc.bh); + /* ext4_journal_release_buffer(handle, iloc.bh); */ brelse(iloc.bh); exit_dindj: - //ext4_journal_release_buffer(handle, dind); + /* ext4_journal_release_buffer(handle, dind); */ exit_primary: - //ext4_journal_release_buffer(handle, *primary); + /* ext4_journal_release_buffer(handle, *primary); */ exit_sbh: - //ext4_journal_release_buffer(handle, *primary); + /* ext4_journal_release_buffer(handle, *primary); */ exit_dind: brelse(dind); exit_bh: @@ -773,7 +773,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (reserved_gdb || gdb_off == 0) { if (!EXT4_HAS_COMPAT_FEATURE(sb, - EXT4_FEATURE_COMPAT_RESIZE_INODE)){ + EXT4_FEATURE_COMPAT_RESIZE_INODE) + || !le16_to_cpu(es->s_reserved_gdt_blocks)) { ext4_warning(sb, __func__, "No reserved GDT blocks, can't resize"); return -EPERM; @@ -818,12 +819,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) goto exit_journal; - /* - * We will only either add reserved group blocks to a backup group - * or remove reserved blocks for the first group in a new group block. - * Doing both would be mean more complex code, and sane people don't - * use non-sparse filesystems anymore. This is already checked above. - */ + /* + * We will only either add reserved group blocks to a backup group + * or remove reserved blocks for the first group in a new group block. + * Doing both would be mean more complex code, and sane people don't + * use non-sparse filesystems anymore. This is already checked above. + */ if (gdb_off) { primary = sbi->s_group_desc[gdb_num]; if ((err = ext4_journal_get_write_access(handle, primary))) @@ -835,24 +836,24 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) } else if ((err = add_new_gdb(handle, inode, input, &primary))) goto exit_journal; - /* - * OK, now we've set up the new group. Time to make it active. - * - * Current kernels don't lock all allocations via lock_super(), - * so we have to be safe wrt. concurrent accesses the group - * data. So we need to be careful to set all of the relevant - * group descriptor data etc. *before* we enable the group. - * - * The key field here is sbi->s_groups_count: as long as - * that retains its old value, nobody is going to access the new - * group. - * - * So first we update all the descriptor metadata for the new - * group; then we update the total disk blocks count; then we - * update the groups count to enable the group; then finally we - * update the free space counts so that the system can start - * using the new disk blocks. - */ + /* + * OK, now we've set up the new group. Time to make it active. + * + * Current kernels don't lock all allocations via lock_super(), + * so we have to be safe wrt. concurrent accesses the group + * data. So we need to be careful to set all of the relevant + * group descriptor data etc. *before* we enable the group. + * + * The key field here is sbi->s_groups_count: as long as + * that retains its old value, nobody is going to access the new + * group. + * + * So first we update all the descriptor metadata for the new + * group; then we update the total disk blocks count; then we + * update the groups count to enable the group; then finally we + * update the free space counts so that the system can start + * using the new disk blocks. + */ /* Update group descriptor block for new group */ gdp = (struct ext4_group_desc *)((char *)primary->b_data + @@ -946,7 +947,8 @@ exit_put: return err; } /* ext4_group_add */ -/* Extend the filesystem to the new number of blocks specified. This entry +/* + * Extend the filesystem to the new number of blocks specified. This entry * point is only used to extend the current filesystem to the end of the last * existing group. It can be accessed via ioctl, or by "remount,resize=<size>" * for emergencies (because it has no dependencies on reserved blocks). @@ -1024,7 +1026,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, o_blocks_count + add, add); /* See if the device is actually as big as what was requested */ - bh = sb_bread(sb, o_blocks_count + add -1); + bh = sb_bread(sb, o_blocks_count + add - 1); if (!bh) { ext4_warning(sb, __func__, "can't read last block, resize aborted"); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1e69f29..566344b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -49,20 +49,19 @@ static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); static int ext4_create_journal(struct super_block *, struct ext4_super_block *, unsigned int); -static void ext4_commit_super (struct super_block * sb, - struct ext4_super_block * es, - int sync); -static void ext4_mark_recovery_complete(struct super_block * sb, - struct ext4_super_block * es); -static void ext4_clear_journal_err(struct super_block * sb, - struct ext4_super_block * es); +static void ext4_commit_super(struct super_block *sb, + struct ext4_super_block *es, int sync); +static void ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es); +static void ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); -static const char *ext4_decode_error(struct super_block * sb, int errno, +static const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]); -static int ext4_remount (struct super_block * sb, int * flags, char * data); -static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf); +static int ext4_remount(struct super_block *sb, int *flags, char *data); +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static void ext4_unlockfs(struct super_block *sb); -static void ext4_write_super (struct super_block * sb); +static void ext4_write_super(struct super_block *sb); static void ext4_write_super_lockfs(struct super_block *sb); @@ -211,15 +210,15 @@ static void ext4_handle_error(struct super_block *sb) if (sb->s_flags & MS_RDONLY) return; - if (!test_opt (sb, ERRORS_CONT)) { + if (!test_opt(sb, ERRORS_CONT)) { journal_t *journal = EXT4_SB(sb)->s_journal; EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT; if (journal) jbd2_journal_abort(journal, -EIO); } - if (test_opt (sb, ERRORS_RO)) { - printk (KERN_CRIT "Remounting filesystem read-only\n"); + if (test_opt(sb, ERRORS_RO)) { + printk(KERN_CRIT "Remounting filesystem read-only\n"); sb->s_flags |= MS_RDONLY; } ext4_commit_super(sb, es, 1); @@ -228,13 +227,13 @@ static void ext4_handle_error(struct super_block *sb) sb->s_id); } -void ext4_error (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext4_error(struct super_block *sb, const char *function, + const char *fmt, ...) { va_list args; va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function); + printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); vprintk(fmt, args); printk("\n"); va_end(args); @@ -242,7 +241,7 @@ void ext4_error (struct super_block * sb, const char * function, ext4_handle_error(sb); } -static const char *ext4_decode_error(struct super_block * sb, int errno, +static const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]) { char *errstr = NULL; @@ -278,8 +277,7 @@ static const char *ext4_decode_error(struct super_block * sb, int errno, /* __ext4_std_error decodes expected errors from journaling functions * automatically and invokes the appropriate error response. */ -void __ext4_std_error (struct super_block * sb, const char * function, - int errno) +void __ext4_std_error(struct super_block *sb, const char *function, int errno) { char nbuf[16]; const char *errstr; @@ -292,8 +290,8 @@ void __ext4_std_error (struct super_block * sb, const char * function, return; errstr = ext4_decode_error(sb, errno, nbuf); - printk (KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n", - sb->s_id, function, errstr); + printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n", + sb->s_id, function, errstr); ext4_handle_error(sb); } @@ -308,15 +306,15 @@ void __ext4_std_error (struct super_block * sb, const char * function, * case we take the easy way out and panic immediately. */ -void ext4_abort (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext4_abort(struct super_block *sb, const char *function, + const char *fmt, ...) { va_list args; - printk (KERN_CRIT "ext4_abort called.\n"); + printk(KERN_CRIT "ext4_abort called.\n"); va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function); + printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); vprintk(fmt, args); printk("\n"); va_end(args); @@ -334,8 +332,8 @@ void ext4_abort (struct super_block * sb, const char * function, jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); } -void ext4_warning (struct super_block * sb, const char * function, - const char * fmt, ...) +void ext4_warning(struct super_block *sb, const char *function, + const char *fmt, ...) { va_list args; @@ -496,7 +494,7 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) } } -static void ext4_put_super (struct super_block * sb) +static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; @@ -570,6 +568,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) #endif ei->i_block_alloc_info = NULL; ei->vfs_inode.i_version = 1; + ei->vfs_inode.i_data.writeback_index = 0; memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); @@ -647,7 +646,8 @@ static void ext4_clear_inode(struct inode *inode) &EXT4_I(inode)->jinode); } -static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) +static inline void ext4_show_quota_options(struct seq_file *seq, + struct super_block *sb) { #if defined(CONFIG_QUOTA) struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -822,8 +822,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, } #ifdef CONFIG_QUOTA -#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") -#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) +#define QTYPE2NAME(t) ((t) == USRQUOTA?"user":"group") +#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) static int ext4_dquot_initialize(struct inode *inode, int type); static int ext4_dquot_drop(struct inode *inode); @@ -991,12 +991,12 @@ static ext4_fsblk_t get_sb_block(void **data) return sb_block; } -static int parse_options (char *options, struct super_block *sb, - unsigned int *inum, unsigned long *journal_devnum, - ext4_fsblk_t *n_blocks_count, int is_remount) +static int parse_options(char *options, struct super_block *sb, + unsigned int *inum, unsigned long *journal_devnum, + ext4_fsblk_t *n_blocks_count, int is_remount) { struct ext4_sb_info *sbi = EXT4_SB(sb); - char * p; + char *p; substring_t args[MAX_OPT_ARGS]; int data_opt = 0; int option; @@ -1009,7 +1009,7 @@ static int parse_options (char *options, struct super_block *sb, if (!options) return 1; - while ((p = strsep (&options, ",")) != NULL) { + while ((p = strsep(&options, ",")) != NULL) { int token; if (!*p) continue; @@ -1017,16 +1017,16 @@ static int parse_options (char *options, struct super_block *sb, token = match_token(p, tokens, args); switch (token) { case Opt_bsd_df: - clear_opt (sbi->s_mount_opt, MINIX_DF); + clear_opt(sbi->s_mount_opt, MINIX_DF); break; case Opt_minix_df: - set_opt (sbi->s_mount_opt, MINIX_DF); + set_opt(sbi->s_mount_opt, MINIX_DF); break; case Opt_grpid: - set_opt (sbi->s_mount_opt, GRPID); + set_opt(sbi->s_mount_opt, GRPID); break; case Opt_nogrpid: - clear_opt (sbi->s_mount_opt, GRPID); + clear_opt(sbi->s_mount_opt, GRPID); break; case Opt_resuid: if (match_int(&args[0], &option)) @@ -1043,41 +1043,41 @@ static int parse_options (char *options, struct super_block *sb, /* *sb_block = match_int(&args[0]); */ break; case Opt_err_panic: - clear_opt (sbi->s_mount_opt, ERRORS_CONT); - clear_opt (sbi->s_mount_opt, ERRORS_RO); - set_opt (sbi->s_mount_opt, ERRORS_PANIC); + clear_opt(sbi->s_mount_opt, ERRORS_CONT); + clear_opt(sbi->s_mount_opt, ERRORS_RO); + set_opt(sbi->s_mount_opt, ERRORS_PANIC); break; case Opt_err_ro: - clear_opt (sbi->s_mount_opt, ERRORS_CONT); - clear_opt (sbi->s_mount_opt, ERRORS_PANIC); - set_opt (sbi->s_mount_opt, ERRORS_RO); + clear_opt(sbi->s_mount_opt, ERRORS_CONT); + clear_opt(sbi->s_mount_opt, ERRORS_PANIC); + set_opt(sbi->s_mount_opt, ERRORS_RO); break; case Opt_err_cont: - clear_opt (sbi->s_mount_opt, ERRORS_RO); - clear_opt (sbi->s_mount_opt, ERRORS_PANIC); - set_opt (sbi->s_mount_opt, ERRORS_CONT); + clear_opt(sbi->s_mount_opt, ERRORS_RO); + clear_opt(sbi->s_mount_opt, ERRORS_PANIC); + set_opt(sbi->s_mount_opt, ERRORS_CONT); break; case Opt_nouid32: - set_opt (sbi->s_mount_opt, NO_UID32); + set_opt(sbi->s_mount_opt, NO_UID32); break; case Opt_nocheck: - clear_opt (sbi->s_mount_opt, CHECK); + clear_opt(sbi->s_mount_opt, CHECK); break; case Opt_debug: - set_opt (sbi->s_mount_opt, DEBUG); + set_opt(sbi->s_mount_opt, DEBUG); break; case Opt_oldalloc: - set_opt (sbi->s_mount_opt, OLDALLOC); + set_opt(sbi->s_mount_opt, OLDALLOC); break; case Opt_orlov: - clear_opt (sbi->s_mount_opt, OLDALLOC); + clear_opt(sbi->s_mount_opt, OLDALLOC); break; #ifdef CONFIG_EXT4DEV_FS_XATTR case Opt_user_xattr: - set_opt (sbi->s_mount_opt, XATTR_USER); + set_opt(sbi->s_mount_opt, XATTR_USER); break; case Opt_nouser_xattr: - clear_opt (sbi->s_mount_opt, XATTR_USER); + clear_opt(sbi->s_mount_opt, XATTR_USER); break; #else case Opt_user_xattr: @@ -1115,7 +1115,7 @@ static int parse_options (char *options, struct super_block *sb, "journal on remount\n"); return 0; } - set_opt (sbi->s_mount_opt, UPDATE_JOURNAL); + set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); break; case Opt_journal_inum: if (is_remount) { @@ -1145,7 +1145,7 @@ static int parse_options (char *options, struct super_block *sb, set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); break; case Opt_noload: - set_opt (sbi->s_mount_opt, NOLOAD); + set_opt(sbi->s_mount_opt, NOLOAD); break; case Opt_commit: if (match_int(&args[0], &option)) @@ -1331,7 +1331,7 @@ set_qf_format: "on this filesystem, use tune2fs\n"); return 0; } - set_opt (sbi->s_mount_opt, EXTENTS); + set_opt(sbi->s_mount_opt, EXTENTS); break; case Opt_noextents: /* @@ -1348,7 +1348,7 @@ set_qf_format: "-o noextents options\n"); return 0; } - clear_opt (sbi->s_mount_opt, EXTENTS); + clear_opt(sbi->s_mount_opt, EXTENTS); break; case Opt_i_version: set_opt(sbi->s_mount_opt, I_VERSION); @@ -1374,9 +1374,9 @@ set_qf_format: set_opt(sbi->s_mount_opt, DELALLOC); break; default: - printk (KERN_ERR - "EXT4-fs: Unrecognized mount option \"%s\" " - "or missing value\n", p); + printk(KERN_ERR + "EXT4-fs: Unrecognized mount option \"%s\" " + "or missing value\n", p); return 0; } } @@ -1423,31 +1423,31 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int res = 0; if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { - printk (KERN_ERR "EXT4-fs warning: revision level too high, " - "forcing read-only mode\n"); + printk(KERN_ERR "EXT4-fs warning: revision level too high, " + "forcing read-only mode\n"); res = MS_RDONLY; } if (read_only) return res; if (!(sbi->s_mount_state & EXT4_VALID_FS)) - printk (KERN_WARNING "EXT4-fs warning: mounting unchecked fs, " - "running e2fsck is recommended\n"); + printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, " + "running e2fsck is recommended\n"); else if ((sbi->s_mount_state & EXT4_ERROR_FS)) - printk (KERN_WARNING - "EXT4-fs warning: mounting fs with errors, " - "running e2fsck is recommended\n"); + printk(KERN_WARNING + "EXT4-fs warning: mounting fs with errors, " + "running e2fsck is recommended\n"); else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) - printk (KERN_WARNING - "EXT4-fs warning: maximal mount count reached, " - "running e2fsck is recommended\n"); + printk(KERN_WARNING + "EXT4-fs warning: maximal mount count reached, " + "running e2fsck is recommended\n"); else if (le32_to_cpu(es->s_checkinterval) && (le32_to_cpu(es->s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= get_seconds())) - printk (KERN_WARNING - "EXT4-fs warning: checktime reached, " - "running e2fsck is recommended\n"); + printk(KERN_WARNING + "EXT4-fs warning: checktime reached, " + "running e2fsck is recommended\n"); #if 0 /* @@@ We _will_ want to clear the valid bit if we find * inconsistencies, to force a fsck at reboot. But for @@ -1506,14 +1506,13 @@ static int ext4_fill_flex_info(struct super_block *sb) flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) / groups_per_flex; - sbi->s_flex_groups = kmalloc(flex_group_count * + sbi->s_flex_groups = kzalloc(flex_group_count * sizeof(struct flex_groups), GFP_KERNEL); if (sbi->s_flex_groups == NULL) { - printk(KERN_ERR "EXT4-fs: not enough memory\n"); + printk(KERN_ERR "EXT4-fs: not enough memory for " + "%lu flex groups\n", flex_group_count); goto failed; } - memset(sbi->s_flex_groups, 0, flex_group_count * - sizeof(struct flex_groups)); gdp = ext4_get_group_desc(sb, 1, &bh); block_bitmap = ext4_block_bitmap(sb, gdp) - 1; @@ -1597,16 +1596,14 @@ static int ext4_check_descriptors(struct super_block *sb) (EXT4_BLOCKS_PER_GROUP(sb) - 1); block_bitmap = ext4_block_bitmap(sb, gdp); - if (block_bitmap < first_block || block_bitmap > last_block) - { + if (block_bitmap < first_block || block_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Block bitmap for group %lu not in group " "(block %llu)!", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); - if (inode_bitmap < first_block || inode_bitmap > last_block) - { + if (inode_bitmap < first_block || inode_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Inode bitmap for group %lu not in group " "(block %llu)!", i, inode_bitmap); @@ -1614,26 +1611,28 @@ static int ext4_check_descriptors(struct super_block *sb) } inode_table = ext4_inode_table(sb, gdp); if (inode_table < first_block || - inode_table + sbi->s_itb_per_group - 1 > last_block) - { + inode_table + sbi->s_itb_per_group - 1 > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Inode table for group %lu not in group " "(block %llu)!", i, inode_table); return 0; } + spin_lock(sb_bgl_lock(sbi, i)); if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Checksum for group %lu failed (%u!=%u)\n", i, le16_to_cpu(ext4_group_desc_csum(sbi, i, gdp)), le16_to_cpu(gdp->bg_checksum)); - return 0; + if (!(sb->s_flags & MS_RDONLY)) + return 0; } + spin_unlock(sb_bgl_lock(sbi, i)); if (!flexbg_flag) first_block += EXT4_BLOCKS_PER_GROUP(sb); } ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); - sbi->s_es->s_free_inodes_count=cpu_to_le32(ext4_count_free_inodes(sb)); + sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); return 1; } @@ -1654,8 +1653,8 @@ static int ext4_check_descriptors(struct super_block *sb) * e2fsck was run on this filesystem, and it must have already done the orphan * inode cleanup for us, so we can safely abort without any further action. */ -static void ext4_orphan_cleanup (struct super_block * sb, - struct ext4_super_block * es) +static void ext4_orphan_cleanup(struct super_block *sb, + struct ext4_super_block *es) { unsigned int s_flags = sb->s_flags; int nr_orphans = 0, nr_truncates = 0; @@ -1732,7 +1731,7 @@ static void ext4_orphan_cleanup (struct super_block * sb, iput(inode); /* The delete magic happens here! */ } -#define PLURAL(x) (x), ((x)==1) ? "" : "s" +#define PLURAL(x) (x), ((x) == 1) ? "" : "s" if (nr_orphans) printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n", @@ -1899,12 +1898,12 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) return 0; } -static int ext4_fill_super (struct super_block *sb, void *data, int silent) +static int ext4_fill_super(struct super_block *sb, void *data, int silent) __releases(kernel_lock) __acquires(kernel_lock) { - struct buffer_head * bh; + struct buffer_head *bh; struct ext4_super_block *es = NULL; struct ext4_sb_info *sbi; ext4_fsblk_t block; @@ -1953,7 +1952,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) } if (!(bh = sb_bread(sb, logical_sb_block))) { - printk (KERN_ERR "EXT4-fs: unable to read superblock\n"); + printk(KERN_ERR "EXT4-fs: unable to read superblock\n"); goto out_fail; } /* @@ -2026,8 +2025,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, DELALLOC); - if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, - NULL, 0)) + if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum, + NULL, 0)) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -2102,7 +2101,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; } - brelse (bh); + brelse(bh); logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); bh = sb_bread(sb, logical_sb_block); @@ -2114,8 +2113,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) es = (struct ext4_super_block *)(((char *)bh->b_data) + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { - printk (KERN_ERR - "EXT4-fs: Magic mismatch, very weird !\n"); + printk(KERN_ERR + "EXT4-fs: Magic mismatch, very weird !\n"); goto failed_mount; } } @@ -2132,9 +2131,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || (!is_power_of_2(sbi->s_inode_size)) || (sbi->s_inode_size > blocksize)) { - printk (KERN_ERR - "EXT4-fs: unsupported inode size: %d\n", - sbi->s_inode_size); + printk(KERN_ERR + "EXT4-fs: unsupported inode size: %d\n", + sbi->s_inode_size); goto failed_mount; } if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) @@ -2166,20 +2165,20 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) sbi->s_mount_state = le16_to_cpu(es->s_state); sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); - for (i=0; i < 4; i++) + for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; if (sbi->s_blocks_per_group > blocksize * 8) { - printk (KERN_ERR - "EXT4-fs: #blocks per group too big: %lu\n", - sbi->s_blocks_per_group); + printk(KERN_ERR + "EXT4-fs: #blocks per group too big: %lu\n", + sbi->s_blocks_per_group); goto failed_mount; } if (sbi->s_inodes_per_group > blocksize * 8) { - printk (KERN_ERR - "EXT4-fs: #inodes per group too big: %lu\n", - sbi->s_inodes_per_group); + printk(KERN_ERR + "EXT4-fs: #inodes per group too big: %lu\n", + sbi->s_inodes_per_group); goto failed_mount; } @@ -2213,10 +2212,10 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) sbi->s_groups_count = blocks_count; db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); - sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), + sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), GFP_KERNEL); if (sbi->s_group_desc == NULL) { - printk (KERN_ERR "EXT4-fs: not enough memory\n"); + printk(KERN_ERR "EXT4-fs: not enough memory\n"); goto failed_mount; } @@ -2226,13 +2225,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) block = descriptor_loc(sb, logical_sb_block, i); sbi->s_group_desc[i] = sb_bread(sb, block); if (!sbi->s_group_desc[i]) { - printk (KERN_ERR "EXT4-fs: " - "can't read group descriptor %d\n", i); + printk(KERN_ERR "EXT4-fs: " + "can't read group descriptor %d\n", i); db_count = i; goto failed_mount2; } } - if (!ext4_check_descriptors (sb)) { + if (!ext4_check_descriptors(sb)) { printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); goto failed_mount2; } @@ -2308,11 +2307,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) EXT4_SB(sb)->s_journal->j_failed_commit) { printk(KERN_CRIT "EXT4-fs error (device %s): " "ext4_fill_super: Journal transaction " - "%u is corrupt\n", sb->s_id, + "%u is corrupt\n", sb->s_id, EXT4_SB(sb)->s_journal->j_failed_commit); - if (test_opt (sb, ERRORS_RO)) { - printk (KERN_CRIT - "Mounting filesystem read-only\n"); + if (test_opt(sb, ERRORS_RO)) { + printk(KERN_CRIT + "Mounting filesystem read-only\n"); sb->s_flags |= MS_RDONLY; EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); @@ -2332,9 +2331,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount3; } else { if (!silent) - printk (KERN_ERR - "ext4: No journal on filesystem on %s\n", - sb->s_id); + printk(KERN_ERR + "ext4: No journal on filesystem on %s\n", + sb->s_id); goto failed_mount3; } @@ -2418,7 +2417,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount4; } - ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY); + ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY); /* determine the minimum size of new large inodes, if present */ if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { @@ -2457,12 +2456,12 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; if (needs_recovery) - printk (KERN_INFO "EXT4-fs: recovery complete.\n"); + printk(KERN_INFO "EXT4-fs: recovery complete.\n"); ext4_mark_recovery_complete(sb, es); - printk (KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n", - test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": - test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": - "writeback"); + printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n", + test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": + test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": + "writeback"); if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " @@ -2575,14 +2574,14 @@ static journal_t *ext4_get_journal(struct super_block *sb, static journal_t *ext4_get_dev_journal(struct super_block *sb, dev_t j_dev) { - struct buffer_head * bh; + struct buffer_head *bh; journal_t *journal; ext4_fsblk_t start; ext4_fsblk_t len; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; - struct ext4_super_block * es; + struct ext4_super_block *es; struct block_device *bdev; bdev = ext4_blkdev_get(j_dev); @@ -2697,8 +2696,8 @@ static int ext4_load_journal(struct super_block *sb, "unavailable, cannot proceed.\n"); return -EROFS; } - printk (KERN_INFO "EXT4-fs: write access will " - "be enabled during recovery.\n"); + printk(KERN_INFO "EXT4-fs: write access will " + "be enabled during recovery.\n"); } } @@ -2751,8 +2750,8 @@ static int ext4_load_journal(struct super_block *sb, return 0; } -static int ext4_create_journal(struct super_block * sb, - struct ext4_super_block * es, +static int ext4_create_journal(struct super_block *sb, + struct ext4_super_block *es, unsigned int journal_inum) { journal_t *journal; @@ -2793,9 +2792,8 @@ static int ext4_create_journal(struct super_block * sb, return 0; } -static void ext4_commit_super (struct super_block * sb, - struct ext4_super_block * es, - int sync) +static void ext4_commit_super(struct super_block *sb, + struct ext4_super_block *es, int sync) { struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; @@ -2816,8 +2814,8 @@ static void ext4_commit_super (struct super_block * sb, * remounting) the filesystem readonly, then we will end up with a * consistent fs on disk. Record that fact. */ -static void ext4_mark_recovery_complete(struct super_block * sb, - struct ext4_super_block * es) +static void ext4_mark_recovery_complete(struct super_block *sb, + struct ext4_super_block *es) { journal_t *journal = EXT4_SB(sb)->s_journal; @@ -2839,8 +2837,8 @@ static void ext4_mark_recovery_complete(struct super_block * sb, * has recorded an error from a previous lifetime, move that error to the * main filesystem now. */ -static void ext4_clear_journal_err(struct super_block * sb, - struct ext4_super_block * es) +static void ext4_clear_journal_err(struct super_block *sb, + struct ext4_super_block *es) { journal_t *journal; int j_errno; @@ -2865,7 +2863,7 @@ static void ext4_clear_journal_err(struct super_block * sb, EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super (sb, es, 1); + ext4_commit_super(sb, es, 1); jbd2_journal_clear_err(journal); } @@ -2898,7 +2896,7 @@ int ext4_force_commit(struct super_block *sb) * This implicitly triggers the writebehind on sync(). */ -static void ext4_write_super (struct super_block * sb) +static void ext4_write_super(struct super_block *sb) { if (mutex_trylock(&sb->s_lock) != 0) BUG(); @@ -2954,13 +2952,14 @@ static void ext4_unlockfs(struct super_block *sb) } } -static int ext4_remount (struct super_block * sb, int * flags, char * data) +static int ext4_remount(struct super_block *sb, int *flags, char *data) { - struct ext4_super_block * es; + struct ext4_super_block *es; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t n_blocks_count = 0; unsigned long old_sb_flags; struct ext4_mount_options old_opts; + ext4_group_t g; int err; #ifdef CONFIG_QUOTA int i; @@ -3039,6 +3038,26 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data) } /* + * Make sure the group descriptor checksums + * are sane. If they aren't, refuse to + * remount r/w. + */ + for (g = 0; g < sbi->s_groups_count; g++) { + struct ext4_group_desc *gdp = + ext4_get_group_desc(sb, g, NULL); + + if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { + printk(KERN_ERR + "EXT4-fs: ext4_remount: " + "Checksum for group %lu failed (%u!=%u)\n", + g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), + le16_to_cpu(gdp->bg_checksum)); + err = -EINVAL; + goto restore_opts; + } + } + + /* * If we have an unprocessed orphan list hanging * around from a previously readonly bdev mount, * require a full umount/remount for now. @@ -3063,7 +3082,7 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data) sbi->s_mount_state = le16_to_cpu(es->s_state); if ((err = ext4_group_extend(sb, es, n_blocks_count))) goto restore_opts; - if (!ext4_setup_super (sb, es, 0)) + if (!ext4_setup_super(sb, es, 0)) sb->s_flags &= ~MS_RDONLY; } } @@ -3093,7 +3112,7 @@ restore_opts: return err; } -static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf) +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -3331,12 +3350,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, } /* Journaling quota? */ if (EXT4_SB(sb)->s_qf_names[type]) { - /* Quotafile not of fs root? */ + /* Quotafile not in fs root? */ if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) printk(KERN_WARNING "EXT4-fs: Quota file not on filesystem root. " "Journaled quota will not work.\n"); - } + } /* * When we journal data on quota file, we have to flush journal to see diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 93c5fdc..8954208 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1512,7 +1512,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, char *name = entry->e_name; int n; - for (n=0; n < entry->e_name_len; n++) { + for (n = 0; n < entry->e_name_len; n++) { hash = (hash << NAME_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ *name++; |