diff options
Diffstat (limited to 'fs')
57 files changed, 2549 insertions, 1673 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 620d934..8aa56bb 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -320,31 +320,21 @@ fail_option_alloc: struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, const char *dev_name, char *data) { - int retval = -EINVAL; struct p9_fid *fid; - int rc; + int rc = -ENOMEM; v9ses->uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL); if (!v9ses->uname) - return ERR_PTR(-ENOMEM); + goto err_names; v9ses->aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL); - if (!v9ses->aname) { - kfree(v9ses->uname); - return ERR_PTR(-ENOMEM); - } + if (!v9ses->aname) + goto err_names; init_rwsem(&v9ses->rename_sem); rc = bdi_setup_and_register(&v9ses->bdi, "9p"); - if (rc) { - kfree(v9ses->aname); - kfree(v9ses->uname); - return ERR_PTR(rc); - } - - spin_lock(&v9fs_sessionlist_lock); - list_add(&v9ses->slist, &v9fs_sessionlist); - spin_unlock(&v9fs_sessionlist_lock); + if (rc) + goto err_names; v9ses->uid = INVALID_UID; v9ses->dfltuid = V9FS_DEFUID; @@ -352,10 +342,9 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->clnt = p9_client_create(dev_name, data); if (IS_ERR(v9ses->clnt)) { - retval = PTR_ERR(v9ses->clnt); - v9ses->clnt = NULL; + rc = PTR_ERR(v9ses->clnt); p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n"); - goto error; + goto err_bdi; } v9ses->flags = V9FS_ACCESS_USER; @@ -368,10 +357,8 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, } rc = v9fs_parse_options(v9ses, data); - if (rc < 0) { - retval = rc; - goto error; - } + if (rc < 0) + goto err_clnt; v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; @@ -405,10 +392,9 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, INVALID_UID, v9ses->aname); if (IS_ERR(fid)) { - retval = PTR_ERR(fid); - fid = NULL; + rc = PTR_ERR(fid); p9_debug(P9_DEBUG_ERROR, "cannot attach\n"); - goto error; + goto err_clnt; } if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_SINGLE) @@ -420,12 +406,20 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, /* register the session for caching */ v9fs_cache_session_get_cookie(v9ses); #endif + spin_lock(&v9fs_sessionlist_lock); + list_add(&v9ses->slist, &v9fs_sessionlist); + spin_unlock(&v9fs_sessionlist_lock); return fid; -error: +err_clnt: + p9_client_destroy(v9ses->clnt); +err_bdi: bdi_destroy(&v9ses->bdi); - return ERR_PTR(retval); +err_names: + kfree(v9ses->uname); + kfree(v9ses->aname); + return ERR_PTR(rc); } /** diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index e99a338..bf495ce 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -130,11 +130,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, fid = v9fs_session_init(v9ses, dev_name, data); if (IS_ERR(fid)) { retval = PTR_ERR(fid); - /* - * we need to call session_close to tear down some - * of the data structure setup by session_init - */ - goto close_session; + goto free_session; } sb = sget(fs_type, NULL, v9fs_set_super, flags, v9ses); @@ -195,8 +191,8 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, clunk_fid: p9_client_clunk(fid); -close_session: v9fs_session_close(v9ses); +free_session: kfree(v9ses); return ERR_PTR(retval); diff --git a/fs/block_dev.c b/fs/block_dev.c index c7e4163..f04c873 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -14,6 +14,7 @@ #include <linux/device_cgroup.h> #include <linux/highmem.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/module.h> #include <linux/blkpg.h> #include <linux/magic.h> @@ -546,7 +547,8 @@ static struct file_system_type bd_type = { .kill_sb = kill_anon_super, }; -static struct super_block *blockdev_superblock __read_mostly; +struct super_block *blockdev_superblock __read_mostly; +EXPORT_SYMBOL_GPL(blockdev_superblock); void __init bdev_cache_init(void) { @@ -687,11 +689,6 @@ static struct block_device *bd_acquire(struct inode *inode) return bdev; } -int sb_is_blkdev_sb(struct super_block *sb) -{ - return sb == blockdev_superblock; -} - /* Call when you free inode */ void bd_forget(struct inode *inode) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2ef9a4b..0bccf18 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1745,7 +1745,7 @@ static void end_workqueue_fn(struct btrfs_work *work) bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); - bio_endio_nodec(bio, error); + bio_endio(bio, error); } static int cleaner_kthread(void *arg) @@ -3269,11 +3269,8 @@ static int write_dev_supers(struct btrfs_device *device, */ static void btrfs_end_empty_barrier(struct bio *bio, int err) { - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + if (err) clear_bit(BIO_UPTODATE, &bio->bi_flags); - } if (bio->bi_private) complete(bio->bi_private); bio_put(bio); @@ -3301,11 +3298,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) wait_for_completion(&device->flush_wait); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - printk_in_rcu("BTRFS: disabling barriers on dev %s\n", - rcu_str_deref(device->name)); - device->nobarriers = 1; - } else if (!bio_flagged(bio, BIO_UPTODATE)) { + if (!bio_flagged(bio, BIO_UPTODATE)) { ret = -EIO; btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c32d226..c374e1e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2767,8 +2767,6 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, else btrfsic_submit_bio(rw, bio); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; bio_put(bio); return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 174f5e1..53af23f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -345,7 +345,7 @@ loop_lock: waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); - BUG_ON(atomic_read(&cur->bi_cnt) == 0); + BUG_ON(atomic_read(&cur->__bi_cnt) == 0); /* * if we're doing the sync list, record that our @@ -5586,10 +5586,10 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err) { - if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED)) - bio_endio_nodec(bio, err); - else - bio_endio(bio, err); + bio->bi_private = bbio->private; + bio->bi_end_io = bbio->end_io; + bio_endio(bio, err); + btrfs_put_bbio(bbio); } @@ -5633,8 +5633,6 @@ static void btrfs_end_bio(struct bio *bio, int err) bio = bbio->orig_bio; } - bio->bi_private = bbio->private; - bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; /* only send an error to the higher layers if it is * beyond the tolerance of the btrfs bio @@ -5816,8 +5814,6 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) /* Shoud be the original bio. */ WARN_ON(bio != bbio->orig_bio); - bio->bi_private = bbio->private; - bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; @@ -5898,10 +5894,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, if (dev_nr < total_devs - 1) { bio = btrfs_bio_clone(first_bio, GFP_NOFS); BUG_ON(!bio); /* -ENOMEM */ - } else { + } else bio = first_bio; - bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED; - } submit_stripe_bio(root, bbio, bio, bbio->stripes[dev_nr].physical, dev_nr, rw, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ebc3133..cedae03 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -292,8 +292,6 @@ struct btrfs_bio_stripe { struct btrfs_bio; typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); -#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0) - struct btrfs_bio { atomic_t refs; atomic_t stripes_pending; diff --git a/fs/buffer.c b/fs/buffer.c index c7a5602..1cf7a53 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -30,6 +30,7 @@ #include <linux/quotaops.h> #include <linux/highmem.h> #include <linux/export.h> +#include <linux/backing-dev.h> #include <linux/writeback.h> #include <linux/hash.h> #include <linux/suspend.h> @@ -44,6 +45,9 @@ #include <trace/events/block.h> static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); +static int submit_bh_wbc(int rw, struct buffer_head *bh, + unsigned long bio_flags, + struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -623,21 +627,22 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * * If warn is true, then emit a warning if the page is not uptodate and has * not been truncated. + * + * The caller must hold mem_cgroup_begin_page_stat() lock. */ -static void __set_page_dirty(struct page *page, - struct address_space *mapping, int warn) +static void __set_page_dirty(struct page *page, struct address_space *mapping, + struct mem_cgroup *memcg, int warn) { unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); - account_page_dirtied(page, mapping); + account_page_dirtied(page, mapping, memcg); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } spin_unlock_irqrestore(&mapping->tree_lock, flags); - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } /* @@ -668,6 +673,7 @@ static void __set_page_dirty(struct page *page, int __set_page_dirty_buffers(struct page *page) { int newly_dirty; + struct mem_cgroup *memcg; struct address_space *mapping = page_mapping(page); if (unlikely(!mapping)) @@ -683,11 +689,22 @@ int __set_page_dirty_buffers(struct page *page) bh = bh->b_this_page; } while (bh != head); } + /* + * Use mem_group_begin_page_stat() to keep PageDirty synchronized with + * per-memcg dirty page counters. + */ + memcg = mem_cgroup_begin_page_stat(page); newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); if (newly_dirty) - __set_page_dirty(page, mapping, 1); + __set_page_dirty(page, mapping, memcg, 1); + + mem_cgroup_end_page_stat(memcg); + + if (newly_dirty) + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + return newly_dirty; } EXPORT_SYMBOL(__set_page_dirty_buffers); @@ -1158,11 +1175,18 @@ void mark_buffer_dirty(struct buffer_head *bh) if (!test_set_buffer_dirty(bh)) { struct page *page = bh->b_page; + struct address_space *mapping = NULL; + struct mem_cgroup *memcg; + + memcg = mem_cgroup_begin_page_stat(page); if (!TestSetPageDirty(page)) { - struct address_space *mapping = page_mapping(page); + mapping = page_mapping(page); if (mapping) - __set_page_dirty(page, mapping, 0); + __set_page_dirty(page, mapping, memcg, 0); } + mem_cgroup_end_page_stat(memcg); + if (mapping) + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } } EXPORT_SYMBOL(mark_buffer_dirty); @@ -1684,8 +1708,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; unsigned int blocksize, bbits; int nr_underway = 0; - int write_op = (wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : WRITE); + int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1774,7 +1797,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh(write_op, bh); + submit_bh_wbc(write_op, bh, 0, wbc); nr_underway++; } bh = next; @@ -1828,7 +1851,7 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh(write_op, bh); + submit_bh_wbc(write_op, bh, 0, wbc); nr_underway++; } bh = next; @@ -2938,10 +2961,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) { struct buffer_head *bh = bio->bi_private; - if (err == -EOPNOTSUPP) { - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - } - if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) set_bit(BH_Quiet, &bh->b_state); @@ -2997,10 +3016,10 @@ void guard_bio_eod(int rw, struct bio *bio) } } -int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) +static int submit_bh_wbc(int rw, struct buffer_head *bh, + unsigned long bio_flags, struct writeback_control *wbc) { struct bio *bio; - int ret = 0; BUG_ON(!buffer_locked(bh)); BUG_ON(!buffer_mapped(bh)); @@ -3020,6 +3039,11 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) */ bio = bio_alloc(GFP_NOIO, 1); + if (wbc) { + wbc_init_bio(wbc, bio); + wbc_account_io(wbc, bh->b_page, bh->b_size); + } + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_io_vec[0].bv_page = bh->b_page; @@ -3041,20 +3065,19 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) if (buffer_prio(bh)) rw |= REQ_PRIO; - bio_get(bio); submit_bio(rw, bio); + return 0; +} - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - - bio_put(bio); - return ret; +int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) +{ + return submit_bh_wbc(rw, bh, bio_flags, NULL); } EXPORT_SYMBOL_GPL(_submit_bh); int submit_bh(int rw, struct buffer_head *bh) { - return _submit_bh(rw, bh, 0); + return submit_bh_wbc(rw, bh, 0, NULL); } EXPORT_SYMBOL(submit_bh); @@ -3243,8 +3266,8 @@ int try_to_free_buffers(struct page *page) * to synchronise against __set_page_dirty_buffers and prevent the * dirty bit from being lost. */ - if (ret && TestClearPageDirty(page)) - account_page_cleaned(page, mapping); + if (ret) + cancel_dirty_page(page); spin_unlock(&mapping->private_lock); out: if (buffers_to_free) { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index d0e746e..900e19c 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -882,6 +882,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_iflags |= SB_I_CGROUPWB; if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 024f228..bf8bc8a 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -72,6 +72,7 @@ config EXT4_ENCRYPTION select CRYPTO_ECB select CRYPTO_XTS select CRYPTO_CTS + select CRYPTO_CTR select CRYPTO_SHA256 select KEYS select ENCRYPTED_KEYS diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 955bf49..cd6ea29 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -369,7 +369,7 @@ static void ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); - if (buffer_verified(bh)) + if (buffer_verified(bh) || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) return; ext4_lock_group(sb, block_group); @@ -446,7 +446,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) unlock_buffer(bh); if (err) ext4_error(sb, "Checksum bad for grp %u", block_group); - return bh; + goto verify; } ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 8ff1527..4573155 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -55,6 +55,9 @@ static mempool_t *ext4_bounce_page_pool; static LIST_HEAD(ext4_free_crypto_ctxs); static DEFINE_SPINLOCK(ext4_crypto_ctx_lock); +static struct kmem_cache *ext4_crypto_ctx_cachep; +struct kmem_cache *ext4_crypt_info_cachep; + /** * ext4_release_crypto_ctx() - Releases an encryption context * @ctx: The encryption context to release. @@ -68,18 +71,12 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) { unsigned long flags; - if (ctx->bounce_page) { - if (ctx->flags & EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL) - __free_page(ctx->bounce_page); - else - mempool_free(ctx->bounce_page, ext4_bounce_page_pool); - ctx->bounce_page = NULL; - } - ctx->control_page = NULL; + if (ctx->flags & EXT4_WRITE_PATH_FL && ctx->w.bounce_page) + mempool_free(ctx->w.bounce_page, ext4_bounce_page_pool); + ctx->w.bounce_page = NULL; + ctx->w.control_page = NULL; if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) { - if (ctx->tfm) - crypto_free_tfm(ctx->tfm); - kfree(ctx); + kmem_cache_free(ext4_crypto_ctx_cachep, ctx); } else { spin_lock_irqsave(&ext4_crypto_ctx_lock, flags); list_add(&ctx->free_list, &ext4_free_crypto_ctxs); @@ -88,23 +85,6 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) } /** - * ext4_alloc_and_init_crypto_ctx() - Allocates and inits an encryption context - * @mask: The allocation mask. - * - * Return: An allocated and initialized encryption context on success. An error - * value or NULL otherwise. - */ -static struct ext4_crypto_ctx *ext4_alloc_and_init_crypto_ctx(gfp_t mask) -{ - struct ext4_crypto_ctx *ctx = kzalloc(sizeof(struct ext4_crypto_ctx), - mask); - - if (!ctx) - return ERR_PTR(-ENOMEM); - return ctx; -} - -/** * ext4_get_crypto_ctx() - Gets an encryption context * @inode: The inode for which we are doing the crypto * @@ -118,10 +98,10 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) struct ext4_crypto_ctx *ctx = NULL; int res = 0; unsigned long flags; - struct ext4_encryption_key *key = &EXT4_I(inode)->i_encryption_key; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - if (!ext4_read_workqueue) - ext4_init_crypto(); + if (ci == NULL) + return ERR_PTR(-ENOKEY); /* * We first try getting the ctx from a free list because in @@ -140,50 +120,16 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) list_del(&ctx->free_list); spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags); if (!ctx) { - ctx = ext4_alloc_and_init_crypto_ctx(GFP_NOFS); - if (IS_ERR(ctx)) { - res = PTR_ERR(ctx); + ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS); + if (!ctx) { + res = -ENOMEM; goto out; } ctx->flags |= EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL; } else { ctx->flags &= ~EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL; } - - /* Allocate a new Crypto API context if we don't already have - * one or if it isn't the right mode. */ - BUG_ON(key->mode == EXT4_ENCRYPTION_MODE_INVALID); - if (ctx->tfm && (ctx->mode != key->mode)) { - crypto_free_tfm(ctx->tfm); - ctx->tfm = NULL; - ctx->mode = EXT4_ENCRYPTION_MODE_INVALID; - } - if (!ctx->tfm) { - switch (key->mode) { - case EXT4_ENCRYPTION_MODE_AES_256_XTS: - ctx->tfm = crypto_ablkcipher_tfm( - crypto_alloc_ablkcipher("xts(aes)", 0, 0)); - break; - case EXT4_ENCRYPTION_MODE_AES_256_GCM: - /* TODO(mhalcrow): AEAD w/ gcm(aes); - * crypto_aead_setauthsize() */ - ctx->tfm = ERR_PTR(-ENOTSUPP); - break; - default: - BUG(); - } - if (IS_ERR_OR_NULL(ctx->tfm)) { - res = PTR_ERR(ctx->tfm); - ctx->tfm = NULL; - goto out; - } - ctx->mode = key->mode; - } - BUG_ON(key->size != ext4_encryption_key_size(key->mode)); - - /* There shouldn't be a bounce page attached to the crypto - * context at this point. */ - BUG_ON(ctx->bounce_page); + ctx->flags &= ~EXT4_WRITE_PATH_FL; out: if (res) { @@ -204,20 +150,8 @@ void ext4_exit_crypto(void) { struct ext4_crypto_ctx *pos, *n; - list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list) { - if (pos->bounce_page) { - if (pos->flags & - EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL) { - __free_page(pos->bounce_page); - } else { - mempool_free(pos->bounce_page, - ext4_bounce_page_pool); - } - } - if (pos->tfm) - crypto_free_tfm(pos->tfm); - kfree(pos); - } + list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list) + kmem_cache_free(ext4_crypto_ctx_cachep, pos); INIT_LIST_HEAD(&ext4_free_crypto_ctxs); if (ext4_bounce_page_pool) mempool_destroy(ext4_bounce_page_pool); @@ -225,6 +159,12 @@ void ext4_exit_crypto(void) if (ext4_read_workqueue) destroy_workqueue(ext4_read_workqueue); ext4_read_workqueue = NULL; + if (ext4_crypto_ctx_cachep) + kmem_cache_destroy(ext4_crypto_ctx_cachep); + ext4_crypto_ctx_cachep = NULL; + if (ext4_crypt_info_cachep) + kmem_cache_destroy(ext4_crypt_info_cachep); + ext4_crypt_info_cachep = NULL; } /** @@ -237,23 +177,31 @@ void ext4_exit_crypto(void) */ int ext4_init_crypto(void) { - int i, res; + int i, res = -ENOMEM; mutex_lock(&crypto_init); if (ext4_read_workqueue) goto already_initialized; ext4_read_workqueue = alloc_workqueue("ext4_crypto", WQ_HIGHPRI, 0); - if (!ext4_read_workqueue) { - res = -ENOMEM; + if (!ext4_read_workqueue) + goto fail; + + ext4_crypto_ctx_cachep = KMEM_CACHE(ext4_crypto_ctx, + SLAB_RECLAIM_ACCOUNT); + if (!ext4_crypto_ctx_cachep) + goto fail; + + ext4_crypt_info_cachep = KMEM_CACHE(ext4_crypt_info, + SLAB_RECLAIM_ACCOUNT); + if (!ext4_crypt_info_cachep) goto fail; - } for (i = 0; i < num_prealloc_crypto_ctxs; i++) { struct ext4_crypto_ctx *ctx; - ctx = ext4_alloc_and_init_crypto_ctx(GFP_KERNEL); - if (IS_ERR(ctx)) { - res = PTR_ERR(ctx); + ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS); + if (!ctx) { + res = -ENOMEM; goto fail; } list_add(&ctx->free_list, &ext4_free_crypto_ctxs); @@ -317,32 +265,11 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, struct ablkcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); struct scatterlist dst, src; - struct ext4_inode_info *ei = EXT4_I(inode); - struct crypto_ablkcipher *atfm = __crypto_ablkcipher_cast(ctx->tfm); + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; int res = 0; - BUG_ON(!ctx->tfm); - BUG_ON(ctx->mode != ei->i_encryption_key.mode); - - if (ctx->mode != EXT4_ENCRYPTION_MODE_AES_256_XTS) { - printk_ratelimited(KERN_ERR - "%s: unsupported crypto algorithm: %d\n", - __func__, ctx->mode); - return -ENOTSUPP; - } - - crypto_ablkcipher_clear_flags(atfm, ~0); - crypto_tfm_set_flags(ctx->tfm, CRYPTO_TFM_REQ_WEAK_KEY); - - res = crypto_ablkcipher_setkey(atfm, ei->i_encryption_key.raw, - ei->i_encryption_key.size); - if (res) { - printk_ratelimited(KERN_ERR - "%s: crypto_ablkcipher_setkey() failed\n", - __func__); - return res; - } - req = ablkcipher_request_alloc(atfm, GFP_NOFS); + req = ablkcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", @@ -384,6 +311,15 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, return 0; } +static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx) +{ + ctx->w.bounce_page = mempool_alloc(ext4_bounce_page_pool, GFP_NOWAIT); + if (ctx->w.bounce_page == NULL) + return ERR_PTR(-ENOMEM); + ctx->flags |= EXT4_WRITE_PATH_FL; + return ctx->w.bounce_page; +} + /** * ext4_encrypt() - Encrypts a page * @inode: The inode for which the encryption should take place @@ -413,27 +349,17 @@ struct page *ext4_encrypt(struct inode *inode, return (struct page *) ctx; /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_page(GFP_NOFS); - if (!ciphertext_page) { - /* This is a potential bottleneck, but at least we'll have - * forward progress. */ - ciphertext_page = mempool_alloc(ext4_bounce_page_pool, - GFP_NOFS); - if (WARN_ON_ONCE(!ciphertext_page)) { - ciphertext_page = mempool_alloc(ext4_bounce_page_pool, - GFP_NOFS | __GFP_WAIT); - } - ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; - } else { - ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; - } - ctx->bounce_page = ciphertext_page; - ctx->control_page = plaintext_page; + ciphertext_page = alloc_bounce_page(ctx); + if (IS_ERR(ciphertext_page)) + goto errout; + ctx->w.control_page = plaintext_page; err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index, plaintext_page, ciphertext_page); if (err) { + ciphertext_page = ERR_PTR(err); + errout: ext4_release_crypto_ctx(ctx); - return ERR_PTR(err); + return ciphertext_page; } SetPagePrivate(ciphertext_page); set_page_private(ciphertext_page, (unsigned long)ctx); @@ -470,8 +396,8 @@ int ext4_decrypt_one(struct inode *inode, struct page *page) struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode); - if (!ctx) - return -ENOMEM; + if (IS_ERR(ctx)) + return PTR_ERR(ctx); ret = ext4_decrypt(ctx, page); ext4_release_crypto_ctx(ctx); return ret; @@ -493,21 +419,11 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) if (IS_ERR(ctx)) return PTR_ERR(ctx); - ciphertext_page = alloc_page(GFP_NOFS); - if (!ciphertext_page) { - /* This is a potential bottleneck, but at least we'll have - * forward progress. */ - ciphertext_page = mempool_alloc(ext4_bounce_page_pool, - GFP_NOFS); - if (WARN_ON_ONCE(!ciphertext_page)) { - ciphertext_page = mempool_alloc(ext4_bounce_page_pool, - GFP_NOFS | __GFP_WAIT); - } - ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; - } else { - ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; + ciphertext_page = alloc_bounce_page(ctx); + if (IS_ERR(ciphertext_page)) { + err = PTR_ERR(ciphertext_page); + goto errout; } - ctx->bounce_page = ciphertext_page; while (len--) { err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk, @@ -529,6 +445,7 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) goto errout; } err = submit_bio_wait(WRITE, bio); + bio_put(bio); if (err) goto errout; } diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index fded02f..7dc4eb5 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -48,6 +48,12 @@ bool ext4_valid_filenames_enc_mode(uint32_t mode) return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS); } +static unsigned max_name_len(struct inode *inode) +{ + return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize : + EXT4_NAME_LEN; +} + /** * ext4_fname_encrypt() - * @@ -55,43 +61,52 @@ bool ext4_valid_filenames_enc_mode(uint32_t mode) * ciphertext. Errors are returned as negative numbers. We trust the caller to * allocate sufficient memory to oname string. */ -static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, +static int ext4_fname_encrypt(struct inode *inode, const struct qstr *iname, struct ext4_str *oname) { u32 ciphertext_len; struct ablkcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); - struct crypto_ablkcipher *tfm = ctx->ctfm; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; int res = 0; char iv[EXT4_CRYPTO_BLOCK_SIZE]; - struct scatterlist sg[1]; - int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); - char *workbuf; + struct scatterlist src_sg, dst_sg; + int padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK); + char *workbuf, buf[32], *alloc_buf = NULL; + unsigned lim = max_name_len(inode); - if (iname->len <= 0 || iname->len > ctx->lim) + if (iname->len <= 0 || iname->len > lim) return -EIO; ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ? EXT4_CRYPTO_BLOCK_SIZE : iname->len; ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); - ciphertext_len = (ciphertext_len > ctx->lim) - ? ctx->lim : ciphertext_len; + ciphertext_len = (ciphertext_len > lim) + ? lim : ciphertext_len; + + if (ciphertext_len <= sizeof(buf)) { + workbuf = buf; + } else { + alloc_buf = kmalloc(ciphertext_len, GFP_NOFS); + if (!alloc_buf) + return -ENOMEM; + workbuf = alloc_buf; + } /* Allocate request */ req = ablkcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited( KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); + kfree(alloc_buf); return -ENOMEM; } ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, ext4_dir_crypt_complete, &ecr); - /* Map the workpage */ - workbuf = kmap(ctx->workpage); - /* Copy the input */ memcpy(workbuf, iname->name, iname->len); if (iname->len < ciphertext_len) @@ -101,21 +116,16 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE); /* Create encryption request */ - sg_init_table(sg, 1); - sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0); - ablkcipher_request_set_crypt(req, sg, sg, ciphertext_len, iv); + sg_init_one(&src_sg, workbuf, ciphertext_len); + sg_init_one(&dst_sg, oname->name, ciphertext_len); + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); res = crypto_ablkcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } - if (res >= 0) { - /* Copy the result to output */ - memcpy(oname->name, workbuf, ciphertext_len); - res = ciphertext_len; - } - kunmap(ctx->workpage); + kfree(alloc_buf); ablkcipher_request_free(req); if (res < 0) { printk_ratelimited( @@ -132,20 +142,21 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, * Errors are returned as negative numbers. * We trust the caller to allocate sufficient memory to oname string. */ -static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx, +static int ext4_fname_decrypt(struct inode *inode, const struct ext4_str *iname, struct ext4_str *oname) { struct ext4_str tmp_in[2], tmp_out[1]; struct ablkcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); - struct scatterlist sg[1]; - struct crypto_ablkcipher *tfm = ctx->ctfm; + struct scatterlist src_sg, dst_sg; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; int res = 0; char iv[EXT4_CRYPTO_BLOCK_SIZE]; - char *workbuf; + unsigned lim = max_name_len(inode); - if (iname->len <= 0 || iname->len > ctx->lim) + if (iname->len <= 0 || iname->len > lim) return -EIO; tmp_in[0].name = iname->name; @@ -163,31 +174,19 @@ static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, ext4_dir_crypt_complete, &ecr); - /* Map the workpage */ - workbuf = kmap(ctx->workpage); - - /* Copy the input */ - memcpy(workbuf, iname->name, iname->len); - /* Initialize IV */ memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE); /* Create encryption request */ - sg_init_table(sg, 1); - sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0); - ablkcipher_request_set_crypt(req, sg, sg, iname->len, iv); + sg_init_one(&src_sg, iname->name, iname->len); + sg_init_one(&dst_sg, oname->name, oname->len); + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); res = crypto_ablkcipher_decrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } - if (res >= 0) { - /* Copy the result to output */ - memcpy(oname->name, workbuf, iname->len); - res = iname->len; - } - kunmap(ctx->workpage); ablkcipher_request_free(req); if (res < 0) { printk_ratelimited( @@ -254,207 +253,6 @@ static int digest_decode(const char *src, int len, char *dst) } /** - * ext4_free_fname_crypto_ctx() - - * - * Frees up a crypto context. - */ -void ext4_free_fname_crypto_ctx(struct ext4_fname_crypto_ctx *ctx) -{ - if (ctx == NULL || IS_ERR(ctx)) - return; - - if (ctx->ctfm && !IS_ERR(ctx->ctfm)) - crypto_free_ablkcipher(ctx->ctfm); - if (ctx->htfm && !IS_ERR(ctx->htfm)) - crypto_free_hash(ctx->htfm); - if (ctx->workpage && !IS_ERR(ctx->workpage)) - __free_page(ctx->workpage); - kfree(ctx); -} - -/** - * ext4_put_fname_crypto_ctx() - - * - * Return: The crypto context onto free list. If the free list is above a - * threshold, completely frees up the context, and returns the memory. - * - * TODO: Currently we directly free the crypto context. Eventually we should - * add code it to return to free list. Such an approach will increase - * efficiency of directory lookup. - */ -void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) -{ - if (*ctx == NULL || IS_ERR(*ctx)) - return; - ext4_free_fname_crypto_ctx(*ctx); - *ctx = NULL; -} - -/** - * ext4_search_fname_crypto_ctx() - - */ -static struct ext4_fname_crypto_ctx *ext4_search_fname_crypto_ctx( - const struct ext4_encryption_key *key) -{ - return NULL; -} - -/** - * ext4_alloc_fname_crypto_ctx() - - */ -struct ext4_fname_crypto_ctx *ext4_alloc_fname_crypto_ctx( - const struct ext4_encryption_key *key) -{ - struct ext4_fname_crypto_ctx *ctx; - - ctx = kmalloc(sizeof(struct ext4_fname_crypto_ctx), GFP_NOFS); - if (ctx == NULL) - return ERR_PTR(-ENOMEM); - if (key->mode == EXT4_ENCRYPTION_MODE_INVALID) { - /* This will automatically set key mode to invalid - * As enum for ENCRYPTION_MODE_INVALID is zero */ - memset(&ctx->key, 0, sizeof(ctx->key)); - } else { - memcpy(&ctx->key, key, sizeof(struct ext4_encryption_key)); - } - ctx->has_valid_key = (EXT4_ENCRYPTION_MODE_INVALID == key->mode) - ? 0 : 1; - ctx->ctfm_key_is_ready = 0; - ctx->ctfm = NULL; - ctx->htfm = NULL; - ctx->workpage = NULL; - return ctx; -} - -/** - * ext4_get_fname_crypto_ctx() - - * - * Allocates a free crypto context and initializes it to hold - * the crypto material for the inode. - * - * Return: NULL if not encrypted. Error value on error. Valid pointer otherwise. - */ -struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx( - struct inode *inode, u32 max_ciphertext_len) -{ - struct ext4_fname_crypto_ctx *ctx; - struct ext4_inode_info *ei = EXT4_I(inode); - int res; - - /* Check if the crypto policy is set on the inode */ - res = ext4_encrypted_inode(inode); - if (res == 0) - return NULL; - - if (!ext4_has_encryption_key(inode)) - ext4_generate_encryption_key(inode); - - /* Get a crypto context based on the key. - * A new context is allocated if no context matches the requested key. - */ - ctx = ext4_search_fname_crypto_ctx(&(ei->i_encryption_key)); - if (ctx == NULL) - ctx = ext4_alloc_fname_crypto_ctx(&(ei->i_encryption_key)); - if (IS_ERR(ctx)) - return ctx; - - ctx->flags = ei->i_crypt_policy_flags; - if (ctx->has_valid_key) { - if (ctx->key.mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { - printk_once(KERN_WARNING - "ext4: unsupported key mode %d\n", - ctx->key.mode); - return ERR_PTR(-ENOKEY); - } - - /* As a first cut, we will allocate new tfm in every call. - * later, we will keep the tfm around, in case the key gets - * re-used */ - if (ctx->ctfm == NULL) { - ctx->ctfm = crypto_alloc_ablkcipher("cts(cbc(aes))", - 0, 0); - } - if (IS_ERR(ctx->ctfm)) { - res = PTR_ERR(ctx->ctfm); - printk( - KERN_DEBUG "%s: error (%d) allocating crypto tfm\n", - __func__, res); - ctx->ctfm = NULL; - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(res); - } - if (ctx->ctfm == NULL) { - printk( - KERN_DEBUG "%s: could not allocate crypto tfm\n", - __func__); - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(-ENOMEM); - } - if (ctx->workpage == NULL) - ctx->workpage = alloc_page(GFP_NOFS); - if (IS_ERR(ctx->workpage)) { - res = PTR_ERR(ctx->workpage); - printk( - KERN_DEBUG "%s: error (%d) allocating work page\n", - __func__, res); - ctx->workpage = NULL; - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(res); - } - if (ctx->workpage == NULL) { - printk( - KERN_DEBUG "%s: could not allocate work page\n", - __func__); - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(-ENOMEM); - } - ctx->lim = max_ciphertext_len; - crypto_ablkcipher_clear_flags(ctx->ctfm, ~0); - crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctx->ctfm), - CRYPTO_TFM_REQ_WEAK_KEY); - - /* If we are lucky, we will get a context that is already - * set up with the right key. Else, we will have to - * set the key */ - if (!ctx->ctfm_key_is_ready) { - /* Since our crypto objectives for filename encryption - * are pretty weak, - * we directly use the inode master key */ - res = crypto_ablkcipher_setkey(ctx->ctfm, - ctx->key.raw, ctx->key.size); - if (res) { - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(-EIO); - } - ctx->ctfm_key_is_ready = 1; - } else { - /* In the current implementation, key should never be - * marked "ready" for a context that has just been - * allocated. So we should never reach here */ - BUG(); - } - } - if (ctx->htfm == NULL) - ctx->htfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(ctx->htfm)) { - res = PTR_ERR(ctx->htfm); - printk(KERN_DEBUG "%s: error (%d) allocating hash tfm\n", - __func__, res); - ctx->htfm = NULL; - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(res); - } - if (ctx->htfm == NULL) { - printk(KERN_DEBUG "%s: could not allocate hash tfm\n", - __func__); - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(-ENOMEM); - } - - return ctx; -} - -/** * ext4_fname_crypto_round_up() - * * Return: The next multiple of block size @@ -464,44 +262,29 @@ u32 ext4_fname_crypto_round_up(u32 size, u32 blksize) return ((size+blksize-1)/blksize)*blksize; } -/** - * ext4_fname_crypto_namelen_on_disk() - - */ -int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, - u32 namelen) +unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen) { - u32 ciphertext_len; - int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); - - if (ctx == NULL) - return -EIO; - if (!(ctx->has_valid_key)) - return -EACCES; - ciphertext_len = (namelen < EXT4_CRYPTO_BLOCK_SIZE) ? - EXT4_CRYPTO_BLOCK_SIZE : namelen; - ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); - ciphertext_len = (ciphertext_len > ctx->lim) - ? ctx->lim : ciphertext_len; - return (int) ciphertext_len; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + int padding = 32; + + if (ci) + padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK); + if (ilen < EXT4_CRYPTO_BLOCK_SIZE) + ilen = EXT4_CRYPTO_BLOCK_SIZE; + return ext4_fname_crypto_round_up(ilen, padding); } -/** - * ext4_fname_crypto_alloc_obuff() - +/* + * ext4_fname_crypto_alloc_buffer() - * * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. */ -int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_crypto_alloc_buffer(struct inode *inode, u32 ilen, struct ext4_str *crypto_str) { - unsigned int olen; - int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); + unsigned int olen = ext4_fname_encrypted_size(inode, ilen); - if (!ctx) - return -EIO; - if (padding < EXT4_CRYPTO_BLOCK_SIZE) - padding = EXT4_CRYPTO_BLOCK_SIZE; - olen = ext4_fname_crypto_round_up(ilen, padding); crypto_str->len = olen; if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2; @@ -529,7 +312,7 @@ void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str) /** * ext4_fname_disk_to_usr() - converts a filename from disk space to user space */ -int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, +int _ext4_fname_disk_to_usr(struct inode *inode, struct dx_hash_info *hinfo, const struct ext4_str *iname, struct ext4_str *oname) @@ -537,8 +320,6 @@ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, char buf[24]; int ret; - if (ctx == NULL) - return -EIO; if (iname->len < 3) { /*Check for . and .. */ if (iname->name[0] == '.' && iname->name[iname->len-1] == '.') { @@ -548,8 +329,8 @@ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, return oname->len; } } - if (ctx->has_valid_key) - return ext4_fname_decrypt(ctx, iname, oname); + if (EXT4_I(inode)->i_crypt_info) + return ext4_fname_decrypt(inode, iname, oname); if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) { ret = digest_encode(iname->name, iname->len, oname->name); @@ -568,7 +349,7 @@ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, return ret + 1; } -int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_disk_to_usr(struct inode *inode, struct dx_hash_info *hinfo, const struct ext4_dir_entry_2 *de, struct ext4_str *oname) @@ -576,21 +357,20 @@ int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, struct ext4_str iname = {.name = (unsigned char *) de->name, .len = de->name_len }; - return _ext4_fname_disk_to_usr(ctx, hinfo, &iname, oname); + return _ext4_fname_disk_to_usr(inode, hinfo, &iname, oname); } /** * ext4_fname_usr_to_disk() - converts a filename from user space to disk space */ -int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, struct ext4_str *oname) { int res; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - if (ctx == NULL) - return -EIO; if (iname->len < 3) { /*Check for . and .. */ if (iname->name[0] == '.' && @@ -601,8 +381,8 @@ int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, return oname->len; } } - if (ctx->has_valid_key) { - res = ext4_fname_encrypt(ctx, iname, oname); + if (ci) { + res = ext4_fname_encrypt(inode, iname, oname); return res; } /* Without a proper key, a user is not allowed to modify the filenames @@ -611,109 +391,79 @@ int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, return -EACCES; } -/* - * Calculate the htree hash from a filename from user space - */ -int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, - const struct qstr *iname, - struct dx_hash_info *hinfo) +int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct ext4_filename *fname) { - struct ext4_str tmp; - int ret = 0; - char buf[EXT4_FNAME_CRYPTO_DIGEST_SIZE+1]; + struct ext4_crypt_info *ci; + int ret = 0, bigname = 0; + + memset(fname, 0, sizeof(struct ext4_filename)); + fname->usr_fname = iname; - if (!ctx || + if (!ext4_encrypted_inode(dir) || ((iname->name[0] == '.') && ((iname->len == 1) || ((iname->name[1] == '.') && (iname->len == 2))))) { - ext4fs_dirhash(iname->name, iname->len, hinfo); + fname->disk_name.name = (unsigned char *) iname->name; + fname->disk_name.len = iname->len; return 0; } - - if (!ctx->has_valid_key && iname->name[0] == '_') { - if (iname->len != 33) - return -ENOENT; - ret = digest_decode(iname->name+1, iname->len, buf); - if (ret != 24) - return -ENOENT; - memcpy(&hinfo->hash, buf, 4); - memcpy(&hinfo->minor_hash, buf + 4, 4); + ret = ext4_get_encryption_info(dir); + if (ret) + return ret; + ci = EXT4_I(dir)->i_crypt_info; + if (ci) { + ret = ext4_fname_crypto_alloc_buffer(dir, iname->len, + &fname->crypto_buf); + if (ret < 0) + return ret; + ret = ext4_fname_encrypt(dir, iname, &fname->crypto_buf); + if (ret < 0) + goto errout; + fname->disk_name.name = fname->crypto_buf.name; + fname->disk_name.len = fname->crypto_buf.len; return 0; } + if (!lookup) + return -EACCES; - if (!ctx->has_valid_key && iname->name[0] != '_') { - if (iname->len > 43) - return -ENOENT; - ret = digest_decode(iname->name, iname->len, buf); - ext4fs_dirhash(buf, ret, hinfo); - return 0; + /* We don't have the key and we are doing a lookup; decode the + * user-supplied name + */ + if (iname->name[0] == '_') + bigname = 1; + if ((bigname && (iname->len != 33)) || + (!bigname && (iname->len > 43))) + return -ENOENT; + + fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); + if (fname->crypto_buf.name == NULL) + return -ENOMEM; + ret = digest_decode(iname->name + bigname, iname->len - bigname, + fname->crypto_buf.name); + if (ret < 0) { + ret = -ENOENT; + goto errout; } - - /* First encrypt the plaintext name */ - ret = ext4_fname_crypto_alloc_buffer(ctx, iname->len, &tmp); - if (ret < 0) - return ret; - - ret = ext4_fname_encrypt(ctx, iname, &tmp); - if (ret >= 0) { - ext4fs_dirhash(tmp.name, tmp.len, hinfo); - ret = 0; + fname->crypto_buf.len = ret; + if (bigname) { + memcpy(&fname->hinfo.hash, fname->crypto_buf.name, 4); + memcpy(&fname->hinfo.minor_hash, fname->crypto_buf.name + 4, 4); + } else { + fname->disk_name.name = fname->crypto_buf.name; + fname->disk_name.len = fname->crypto_buf.len; } - - ext4_fname_crypto_free_buffer(&tmp); + return 0; +errout: + kfree(fname->crypto_buf.name); + fname->crypto_buf.name = NULL; return ret; } -int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr, - int len, const char * const name, - struct ext4_dir_entry_2 *de) +void ext4_fname_free_filename(struct ext4_filename *fname) { - int ret = -ENOENT; - int bigname = (*name == '_'); - - if (ctx->has_valid_key) { - if (cstr->name == NULL) { - struct qstr istr; - - ret = ext4_fname_crypto_alloc_buffer(ctx, len, cstr); - if (ret < 0) - goto errout; - istr.name = name; - istr.len = len; - ret = ext4_fname_encrypt(ctx, &istr, cstr); - if (ret < 0) - goto errout; - } - } else { - if (cstr->name == NULL) { - cstr->name = kmalloc(32, GFP_KERNEL); - if (cstr->name == NULL) - return -ENOMEM; - if ((bigname && (len != 33)) || - (!bigname && (len > 43))) - goto errout; - ret = digest_decode(name+bigname, len-bigname, - cstr->name); - if (ret < 0) { - ret = -ENOENT; - goto errout; - } - cstr->len = ret; - } - if (bigname) { - if (de->name_len < 16) - return 0; - ret = memcmp(de->name + de->name_len - 16, - cstr->name + 8, 16); - return (ret == 0) ? 1 : 0; - } - } - if (de->name_len != cstr->len) - return 0; - ret = memcmp(de->name, cstr->name, cstr->len); - return (ret == 0) ? 1 : 0; -errout: - kfree(cstr->name); - cstr->name = NULL; - return ret; + kfree(fname->crypto_buf.name); + fname->crypto_buf.name = NULL; + fname->usr_fname = NULL; + fname->disk_name.name = NULL; } diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 52170d0..442d24e 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -84,14 +84,38 @@ out: return res; } -/** - * ext4_generate_encryption_key() - generates an encryption key - * @inode: The inode to generate the encryption key for. - */ -int ext4_generate_encryption_key(struct inode *inode) +void ext4_free_crypt_info(struct ext4_crypt_info *ci) +{ + if (!ci) + return; + + if (ci->ci_keyring_key) + key_put(ci->ci_keyring_key); + crypto_free_ablkcipher(ci->ci_ctfm); + kmem_cache_free(ext4_crypt_info_cachep, ci); +} + +void ext4_free_encryption_info(struct inode *inode, + struct ext4_crypt_info *ci) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_crypt_info *prev; + + if (ci == NULL) + ci = ACCESS_ONCE(ei->i_crypt_info); + if (ci == NULL) + return; + prev = cmpxchg(&ei->i_crypt_info, ci, NULL); + if (prev != ci) + return; + + ext4_free_crypt_info(ci); +} + +int _ext4_get_encryption_info(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_encryption_key *crypt_key = &ei->i_encryption_key; + struct ext4_crypt_info *crypt_info; char full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE + (EXT4_KEY_DESCRIPTOR_SIZE * 2) + 1]; struct key *keyring_key = NULL; @@ -99,31 +123,76 @@ int ext4_generate_encryption_key(struct inode *inode) struct ext4_encryption_context ctx; struct user_key_payload *ukp; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx)); + struct crypto_ablkcipher *ctfm; + const char *cipher_str; + char raw_key[EXT4_MAX_KEY_SIZE]; + char mode; + int res; - if (res != sizeof(ctx)) { - if (res > 0) - res = -EINVAL; - goto out; + if (!ext4_read_workqueue) { + res = ext4_init_crypto(); + if (res) + return res; + } + +retry: + crypt_info = ACCESS_ONCE(ei->i_crypt_info); + if (crypt_info) { + if (!crypt_info->ci_keyring_key || + key_validate(crypt_info->ci_keyring_key) == 0) + return 0; + ext4_free_encryption_info(inode, crypt_info); + goto retry; } + + res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &ctx, sizeof(ctx)); + if (res < 0) { + if (!DUMMY_ENCRYPTION_ENABLED(sbi)) + return res; + ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = + EXT4_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + } else if (res != sizeof(ctx)) + return -EINVAL; res = 0; - ei->i_crypt_policy_flags = ctx.flags; + crypt_info = kmem_cache_alloc(ext4_crypt_info_cachep, GFP_KERNEL); + if (!crypt_info) + return -ENOMEM; + + crypt_info->ci_flags = ctx.flags; + crypt_info->ci_data_mode = ctx.contents_encryption_mode; + crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; + crypt_info->ci_ctfm = NULL; + crypt_info->ci_keyring_key = NULL; + memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, + sizeof(crypt_info->ci_master_key)); if (S_ISREG(inode->i_mode)) - crypt_key->mode = ctx.contents_encryption_mode; + mode = crypt_info->ci_data_mode; else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - crypt_key->mode = ctx.filenames_encryption_mode; - else { - printk(KERN_ERR "ext4 crypto: Unsupported inode type.\n"); + mode = crypt_info->ci_filename_mode; + else BUG(); + switch (mode) { + case EXT4_ENCRYPTION_MODE_AES_256_XTS: + cipher_str = "xts(aes)"; + break; + case EXT4_ENCRYPTION_MODE_AES_256_CTS: + cipher_str = "cts(cbc(aes))"; + break; + default: + printk_once(KERN_WARNING + "ext4: unsupported key mode %d (ino %u)\n", + mode, (unsigned) inode->i_ino); + res = -ENOKEY; + goto out; } - crypt_key->size = ext4_encryption_key_size(crypt_key->mode); - BUG_ON(!crypt_key->size); if (DUMMY_ENCRYPTION_ENABLED(sbi)) { - memset(crypt_key->raw, 0x42, EXT4_AES_256_XTS_KEY_SIZE); - goto out; + memset(raw_key, 0x42, EXT4_AES_256_XTS_KEY_SIZE); + goto got_key; } memcpy(full_key_descriptor, EXT4_KEY_DESC_PREFIX, EXT4_KEY_DESC_PREFIX_SIZE); @@ -138,6 +207,7 @@ int ext4_generate_encryption_key(struct inode *inode) keyring_key = NULL; goto out; } + crypt_info->ci_keyring_key = keyring_key; BUG_ON(keyring_key->type != &key_type_logon); ukp = ((struct user_key_payload *)keyring_key->payload.data); if (ukp->datalen != sizeof(struct ext4_encryption_key)) { @@ -148,19 +218,43 @@ int ext4_generate_encryption_key(struct inode *inode) BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE != EXT4_KEY_DERIVATION_NONCE_SIZE); BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE); - res = ext4_derive_key_aes(ctx.nonce, master_key->raw, crypt_key->raw); + res = ext4_derive_key_aes(ctx.nonce, master_key->raw, + raw_key); +got_key: + ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); + if (!ctfm || IS_ERR(ctfm)) { + res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; + printk(KERN_DEBUG + "%s: error %d (inode %u) allocating crypto tfm\n", + __func__, res, (unsigned) inode->i_ino); + goto out; + } + crypt_info->ci_ctfm = ctfm; + crypto_ablkcipher_clear_flags(ctfm, ~0); + crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), + CRYPTO_TFM_REQ_WEAK_KEY); + res = crypto_ablkcipher_setkey(ctfm, raw_key, + ext4_encryption_key_size(mode)); + if (res) + goto out; + memzero_explicit(raw_key, sizeof(raw_key)); + if (cmpxchg(&ei->i_crypt_info, NULL, crypt_info) != NULL) { + ext4_free_crypt_info(crypt_info); + goto retry; + } + return 0; + out: - if (keyring_key) - key_put(keyring_key); - if (res < 0) - crypt_key->mode = EXT4_ENCRYPTION_MODE_INVALID; + if (res == -ENOKEY) + res = 0; + ext4_free_crypt_info(crypt_info); + memzero_explicit(raw_key, sizeof(raw_key)); return res; } int ext4_has_encryption_key(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_encryption_key *crypt_key = &ei->i_encryption_key; - return (crypt_key->mode != EXT4_ENCRYPTION_MODE_INVALID); + return (ei->i_crypt_info != NULL); } diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index a6d6291..02c4e5d 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -51,6 +51,10 @@ static int ext4_create_encryption_context_from_policy( struct ext4_encryption_context ctx; int res = 0; + res = ext4_convert_inline_data(inode); + if (res) + return res; + ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, EXT4_KEY_DESCRIPTOR_SIZE); @@ -89,6 +93,8 @@ int ext4_process_policy(const struct ext4_encryption_policy *policy, return -EINVAL; if (!ext4_inode_has_encryption_context(inode)) { + if (!S_ISDIR(inode->i_mode)) + return -EINVAL; if (!ext4_empty_dir(inode)) return -ENOTEMPTY; return ext4_create_encryption_context_from_policy(inode, @@ -126,7 +132,7 @@ int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy) int ext4_is_child_context_consistent_with_parent(struct inode *parent, struct inode *child) { - struct ext4_encryption_context parent_ctx, child_ctx; + struct ext4_crypt_info *parent_ci, *child_ci; int res; if ((parent == NULL) || (child == NULL)) { @@ -136,26 +142,28 @@ int ext4_is_child_context_consistent_with_parent(struct inode *parent, /* no restrictions if the parent directory is not encrypted */ if (!ext4_encrypted_inode(parent)) return 1; - res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - &parent_ctx, sizeof(parent_ctx)); - if (res != sizeof(parent_ctx)) - return 0; /* if the child directory is not encrypted, this is always a problem */ if (!ext4_encrypted_inode(child)) return 0; - res = ext4_xattr_get(child, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - &child_ctx, sizeof(child_ctx)); - if (res != sizeof(child_ctx)) + res = ext4_get_encryption_info(parent); + if (res) return 0; - return (memcmp(parent_ctx.master_key_descriptor, - child_ctx.master_key_descriptor, + res = ext4_get_encryption_info(child); + if (res) + return 0; + parent_ci = EXT4_I(parent)->i_crypt_info; + child_ci = EXT4_I(child)->i_crypt_info; + if (!parent_ci && !child_ci) + return 1; + if (!parent_ci || !child_ci) + return 0; + + return (memcmp(parent_ci->ci_master_key, + child_ci->ci_master_key, EXT4_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ctx.contents_encryption_mode == - child_ctx.contents_encryption_mode) && - (parent_ctx.filenames_encryption_mode == - child_ctx.filenames_encryption_mode)); + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags)); } /** @@ -168,31 +176,40 @@ int ext4_is_child_context_consistent_with_parent(struct inode *parent, int ext4_inherit_context(struct inode *parent, struct inode *child) { struct ext4_encryption_context ctx; - int res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx)); + struct ext4_crypt_info *ci; + int res; + + res = ext4_get_encryption_info(parent); + if (res < 0) + return res; + ci = EXT4_I(parent)->i_crypt_info; + if (ci == NULL) + return -ENOKEY; - if (res != sizeof(ctx)) { - if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) { - ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; - ctx.contents_encryption_mode = - EXT4_ENCRYPTION_MODE_AES_256_XTS; - ctx.filenames_encryption_mode = - EXT4_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; - memset(ctx.master_key_descriptor, 0x42, - EXT4_KEY_DESCRIPTOR_SIZE); - res = 0; - } else { - goto out; - } + ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; + if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) { + ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = + EXT4_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + memset(ctx.master_key_descriptor, 0x42, + EXT4_KEY_DESCRIPTOR_SIZE); + res = 0; + } else { + ctx.contents_encryption_mode = ci->ci_data_mode; + ctx.filenames_encryption_mode = ci->ci_filename_mode; + ctx.flags = ci->ci_flags; + memcpy(ctx.master_key_descriptor, ci->ci_master_key, + EXT4_KEY_DESCRIPTOR_SIZE); } get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, sizeof(ctx), 0); -out: - if (!res) + if (!res) { ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT); + ext4_clear_inode_state(child, EXT4_STATE_MAY_INLINE_DATA); + res = ext4_get_encryption_info(child); + } return res; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 5665d82..f9e1491 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -110,7 +110,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) struct super_block *sb = inode->i_sb; struct buffer_head *bh = NULL; int dir_has_error = 0; - struct ext4_fname_crypto_ctx *enc_ctx = NULL; struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; if (is_dx_dir(inode)) { @@ -134,16 +133,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) return err; } - enc_ctx = ext4_get_fname_crypto_ctx(inode, EXT4_NAME_LEN); - if (IS_ERR(enc_ctx)) - return PTR_ERR(enc_ctx); - if (enc_ctx) { - err = ext4_fname_crypto_alloc_buffer(enc_ctx, EXT4_NAME_LEN, + if (ext4_encrypted_inode(inode)) { + err = ext4_fname_crypto_alloc_buffer(inode, EXT4_NAME_LEN, &fname_crypto_str); - if (err < 0) { - ext4_put_fname_crypto_ctx(&enc_ctx); + if (err < 0) return err; - } } offset = ctx->pos & (sb->s_blocksize - 1); @@ -239,17 +233,19 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); if (le32_to_cpu(de->inode)) { - if (enc_ctx == NULL) { - /* Directory is not encrypted */ + if (!ext4_encrypted_inode(inode)) { if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; } else { + int save_len = fname_crypto_str.len; + /* Directory is encrypted */ - err = ext4_fname_disk_to_usr(enc_ctx, + err = ext4_fname_disk_to_usr(inode, NULL, de, &fname_crypto_str); + fname_crypto_str.len = save_len; if (err < 0) goto errout; if (!dir_emit(ctx, @@ -272,7 +268,6 @@ done: err = 0; errout: #ifdef CONFIG_EXT4_FS_ENCRYPTION - ext4_put_fname_crypto_ctx(&enc_ctx); ext4_fname_crypto_free_buffer(&fname_crypto_str); #endif brelse(bh); @@ -598,6 +593,13 @@ finished: return 0; } +static int ext4_dir_open(struct inode * inode, struct file * filp) +{ + if (ext4_encrypted_inode(inode)) + return ext4_get_encryption_info(inode) ? -EACCES : 0; + return 0; +} + static int ext4_release_dir(struct inode *inode, struct file *filp) { if (filp->private_data) @@ -640,5 +642,6 @@ const struct file_operations ext4_dir_operations = { .compat_ioctl = ext4_compat_ioctl, #endif .fsync = ext4_sync_file, + .open = ext4_dir_open, .release = ext4_release_dir, }; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0a3b72d..f5e9f04 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -69,15 +69,6 @@ #define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif -#define EXT4_ERROR_INODE(inode, fmt, a...) \ - ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) - -#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ - ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) - -#define EXT4_ERROR_FILE(file, block, fmt, a...) \ - ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) - /* data type for block offset of block group */ typedef int ext4_grpblk_t; @@ -90,6 +81,11 @@ typedef __u32 ext4_lblk_t; /* data type for block group number */ typedef unsigned int ext4_group_t; +enum SHIFT_DIRECTION { + SHIFT_LEFT = 0, + SHIFT_RIGHT, +}; + /* * Flags used in mballoc's allocation_context flags field. * @@ -911,7 +907,6 @@ struct ext4_inode_info { /* on-disk additional length */ __u16 i_extra_isize; - char i_crypt_policy_flags; /* Indicate the inline data space. */ u16 i_inline_off; @@ -955,7 +950,7 @@ struct ext4_inode_info { #ifdef CONFIG_EXT4_FS_ENCRYPTION /* Encryption params */ - struct ext4_encryption_key i_encryption_key; + struct ext4_crypt_info *i_crypt_info; #endif }; @@ -1374,12 +1369,6 @@ struct ext4_sb_info { struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; - -#ifdef CONFIG_EXT4_FS_ENCRYPTION - /* Encryption */ - uint32_t s_file_encryption_mode; - uint32_t s_dir_encryption_mode; -#endif }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1838,6 +1827,17 @@ struct dx_hash_info */ #define HASH_NB_ALWAYS 1 +struct ext4_filename { + const struct qstr *usr_fname; + struct ext4_str disk_name; + struct dx_hash_info hinfo; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_str crypto_buf; +#endif +}; + +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) /* * Describe an inode's exact location on disk and in memory @@ -2054,6 +2054,7 @@ int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy); /* crypto.c */ +extern struct kmem_cache *ext4_crypt_info_cachep; bool ext4_valid_contents_enc_mode(uint32_t mode); uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size); extern struct workqueue_struct *ext4_read_workqueue; @@ -2085,57 +2086,84 @@ static inline int ext4_sb_has_crypto(struct super_block *sb) /* crypto_fname.c */ bool ext4_valid_filenames_enc_mode(uint32_t mode); u32 ext4_fname_crypto_round_up(u32 size, u32 blksize); -int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx, +unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen); +int ext4_fname_crypto_alloc_buffer(struct inode *inode, u32 ilen, struct ext4_str *crypto_str); -int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, +int _ext4_fname_disk_to_usr(struct inode *inode, struct dx_hash_info *hinfo, const struct ext4_str *iname, struct ext4_str *oname); -int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_disk_to_usr(struct inode *inode, struct dx_hash_info *hinfo, const struct ext4_dir_entry_2 *de, struct ext4_str *oname); -int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, struct ext4_str *oname); -int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, - const struct qstr *iname, - struct dx_hash_info *hinfo); -int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, - u32 namelen); -int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr, - int len, const char * const name, - struct ext4_dir_entry_2 *de); - - #ifdef CONFIG_EXT4_FS_ENCRYPTION -void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx); -struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode, - u32 max_len); void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str); +int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct ext4_filename *fname); +void ext4_fname_free_filename(struct ext4_filename *fname); #else static inline -void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) { } -static inline -struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode, - u32 max_len) +int ext4_setup_fname_crypto(struct inode *inode) { - return NULL; + return 0; } static inline void ext4_fname_crypto_free_buffer(struct ext4_str *p) { } +static inline int ext4_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct ext4_filename *fname) +{ + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *) iname->name; + fname->disk_name.len = iname->len; + return 0; +} +static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } #endif /* crypto_key.c */ -int ext4_generate_encryption_key(struct inode *inode); +void ext4_free_crypt_info(struct ext4_crypt_info *ci); +void ext4_free_encryption_info(struct inode *inode, struct ext4_crypt_info *ci); +int _ext4_get_encryption_info(struct inode *inode); #ifdef CONFIG_EXT4_FS_ENCRYPTION int ext4_has_encryption_key(struct inode *inode); + +static inline int ext4_get_encryption_info(struct inode *inode) +{ + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + + if (!ci || + (ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD))))) + return _ext4_get_encryption_info(inode); + return 0; +} + +static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode) +{ + return EXT4_I(inode)->i_crypt_info; +} + #else static inline int ext4_has_encryption_key(struct inode *inode) { return 0; } +static inline int ext4_get_encryption_info(struct inode *inode) +{ + return 0; +} +static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode) +{ + return NULL; +} #endif @@ -2156,14 +2184,13 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p); extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct buffer_head *bh, void *buf, int buf_size, - const char *name, int namelen, + struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); int ext4_insert_dentry(struct inode *dir, - struct inode *inode, - struct ext4_dir_entry_2 *de, - int buf_size, - const struct qstr *iname, - const char *name, int namelen); + struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname); static inline void ext4_update_dx_flag(struct inode *inode) { if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, @@ -2317,13 +2344,14 @@ extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); -extern int search_dir(struct buffer_head *bh, - char *search_buf, - int buf_size, - struct inode *dir, - const struct qstr *d_name, - unsigned int offset, - struct ext4_dir_entry_2 **res_dir); +extern int ext4_search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + struct ext4_filename *fname, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); extern int ext4_generic_delete_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de_del, @@ -2368,6 +2396,9 @@ void __ext4_abort(struct super_block *, const char *, unsigned int, extern __printf(4, 5) void __ext4_warning(struct super_block *, const char *, unsigned int, const char *, ...); +extern __printf(4, 5) +void __ext4_warning_inode(const struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...); extern __printf(3, 4) void __ext4_msg(struct super_block *, const char *, const char *, ...); extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, @@ -2378,6 +2409,15 @@ void __ext4_grp_locked_error(const char *, unsigned int, unsigned long, ext4_fsblk_t, const char *, ...); +#define EXT4_ERROR_INODE(inode, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) + +#define EXT4_ERROR_FILE(file, block, fmt, a...) \ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) + #ifdef CONFIG_PRINTK #define ext4_error_inode(inode, func, line, block, fmt, ...) \ @@ -2390,6 +2430,8 @@ void __ext4_grp_locked_error(const char *, unsigned int, __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_warning(sb, fmt, ...) \ __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning_inode(inode, fmt, ...) \ + __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_msg(sb, level, fmt, ...) \ __ext4_msg(sb, level, fmt, ##__VA_ARGS__) #define dump_mmp_msg(sb, mmp, msg) \ @@ -2425,6 +2467,11 @@ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_warning(sb, "", 0, " "); \ } while (0) +#define ext4_warning_inode(inode, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning_inode(inode, "", 0, " "); \ +} while (0) #define ext4_msg(sb, level, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ @@ -2768,7 +2815,9 @@ extern int ext4_da_write_inline_data_begin(struct address_space *mapping, extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page); -extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, +extern int ext4_try_add_inline_entry(handle_t *handle, + struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode); extern int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, @@ -2782,6 +2831,7 @@ extern int htree_inlinedir_to_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, int *has_inline_data); extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + struct ext4_filename *fname, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *has_inline_data); @@ -2913,6 +2963,7 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); +extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index d75159c..ac7d4e8 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -66,24 +66,39 @@ struct ext4_encryption_context { #define EXT4_KEY_DESC_PREFIX "ext4:" #define EXT4_KEY_DESC_PREFIX_SIZE 5 +/* This is passed in from userspace into the kernel keyring */ struct ext4_encryption_key { - uint32_t mode; - char raw[EXT4_MAX_KEY_SIZE]; - uint32_t size; + __u32 mode; + char raw[EXT4_MAX_KEY_SIZE]; + __u32 size; +} __attribute__((__packed__)); + +struct ext4_crypt_info { + char ci_data_mode; + char ci_filename_mode; + char ci_flags; + struct crypto_ablkcipher *ci_ctfm; + struct key *ci_keyring_key; + char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE]; }; #define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 -#define EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL 0x00000002 +#define EXT4_WRITE_PATH_FL 0x00000002 struct ext4_crypto_ctx { - struct crypto_tfm *tfm; /* Crypto API context */ - struct page *bounce_page; /* Ciphertext page on write path */ - struct page *control_page; /* Original page on write path */ - struct bio *bio; /* The bio for this context */ - struct work_struct work; /* Work queue for read complete path */ - struct list_head free_list; /* Free list */ - int flags; /* Flags */ - int mode; /* Encryption mode for tfm */ + union { + struct { + struct page *bounce_page; /* Ciphertext page */ + struct page *control_page; /* Original page */ + } w; + struct { + struct bio *bio; + struct work_struct work; + } r; + struct list_head free_list; /* Free list */ + }; + char flags; /* Flags */ + char mode; /* Encryption mode for tfm */ }; struct ext4_completion_result { @@ -121,18 +136,6 @@ struct ext4_str { u32 len; }; -struct ext4_fname_crypto_ctx { - u32 lim; - char tmp_buf[EXT4_CRYPTO_BLOCK_SIZE]; - struct crypto_ablkcipher *ctfm; - struct crypto_hash *htfm; - struct page *workpage; - struct ext4_encryption_key key; - unsigned flags : 8; - unsigned has_valid_key : 1; - unsigned ctfm_key_is_ready : 1; -}; - /** * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e003a1e..aadb728 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -39,6 +39,7 @@ #include <linux/slab.h> #include <asm/uaccess.h> #include <linux/fiemap.h> +#include <linux/backing-dev.h> #include "ext4_jbd2.h" #include "ext4_extents.h" #include "xattr.h" @@ -4456,6 +4457,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ar.flags |= EXT4_MB_HINT_NOPREALLOC; if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ar.flags |= EXT4_MB_DELALLOC_RESERVED; + if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) + ar.flags |= EXT4_MB_USE_RESERVED; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; @@ -4663,6 +4666,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, int ret = 0; int ret2 = 0; int retries = 0; + int depth = 0; struct ext4_map_blocks map; unsigned int credits; loff_t epos; @@ -4677,13 +4681,32 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, if (len <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + /* * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, len); + /* + * We can only call ext_depth() on extent based inodes + */ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + depth = ext_depth(inode); + else + depth = -1; retry: while (ret >= 0 && len) { + /* + * Recalculate credits when extent tree depth changes. + */ + if (depth >= 0 && depth != ext_depth(inode)) { + credits = ext4_chunk_trans_blocks(inode, len); + depth = ext_depth(inode); + } + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { @@ -4725,6 +4748,8 @@ retry: goto retry; } + ext4_inode_resume_unlocked_dio(inode); + return ret > 0 ? ret2 : ret; } @@ -4912,12 +4937,14 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) * bug we should fix.... */ if (ext4_encrypted_inode(inode) && - (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))) + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE | + FALLOC_FL_ZERO_RANGE))) return -EOPNOTSUPP; /* Return error if mode is not supported */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | + FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) @@ -4930,6 +4957,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (mode & FALLOC_FL_COLLAPSE_RANGE) return ext4_collapse_range(inode, offset, len); + if (mode & FALLOC_FL_INSERT_RANGE) + return ext4_insert_range(inode, offset, len); + if (mode & FALLOC_FL_ZERO_RANGE) return ext4_zero_range(file, offset, len, mode); @@ -5224,13 +5254,13 @@ ext4_access_path(handle_t *handle, struct inode *inode, /* * ext4_ext_shift_path_extents: * Shift the extents of a path structure lying between path[depth].p_ext - * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift - * from starting block for each extent. + * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells + * if it is right shift or left shift operation. */ static int ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, struct inode *inode, handle_t *handle, - ext4_lblk_t *start) + enum SHIFT_DIRECTION SHIFT) { int depth, err = 0; struct ext4_extent *ex_start, *ex_last; @@ -5252,19 +5282,25 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) update = 1; - *start = le32_to_cpu(ex_last->ee_block) + - ext4_ext_get_actual_len(ex_last); - while (ex_start <= ex_last) { - le32_add_cpu(&ex_start->ee_block, -shift); - /* Try to merge to the left. */ - if ((ex_start > - EXT_FIRST_EXTENT(path[depth].p_hdr)) && - ext4_ext_try_to_merge_right(inode, - path, ex_start - 1)) + if (SHIFT == SHIFT_LEFT) { + le32_add_cpu(&ex_start->ee_block, + -shift); + /* Try to merge to the left. */ + if ((ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) + && + ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + else + ex_start++; + } else { + le32_add_cpu(&ex_last->ee_block, shift); + ext4_ext_try_to_merge_right(inode, path, + ex_last); ex_last--; - else - ex_start++; + } } err = ext4_ext_dirty(handle, inode, path + depth); if (err) @@ -5279,7 +5315,10 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (err) goto out; - le32_add_cpu(&path[depth].p_idx->ei_block, -shift); + if (SHIFT == SHIFT_LEFT) + le32_add_cpu(&path[depth].p_idx->ei_block, -shift); + else + le32_add_cpu(&path[depth].p_idx->ei_block, shift); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; @@ -5297,19 +5336,20 @@ out: /* * ext4_ext_shift_extents: - * All the extents which lies in the range from start to the last allocated - * block for the file are shifted downwards by shift blocks. + * All the extents which lies in the range from @start to the last allocated + * block for the @inode are shifted either towards left or right (depending + * upon @SHIFT) by @shift blocks. * On success, 0 is returned, error otherwise. */ static int ext4_ext_shift_extents(struct inode *inode, handle_t *handle, - ext4_lblk_t start, ext4_lblk_t shift) + ext4_lblk_t start, ext4_lblk_t shift, + enum SHIFT_DIRECTION SHIFT) { struct ext4_ext_path *path; int ret = 0, depth; struct ext4_extent *extent; - ext4_lblk_t stop_block; - ext4_lblk_t ex_start, ex_end; + ext4_lblk_t stop, *iterator, ex_start, ex_end; /* Let path point to the last extent */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); @@ -5321,58 +5361,84 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, if (!extent) goto out; - stop_block = le32_to_cpu(extent->ee_block) + + stop = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); - /* Nothing to shift, if hole is at the end of file */ - if (start >= stop_block) - goto out; + /* + * In case of left shift, Don't start shifting extents until we make + * sure the hole is big enough to accommodate the shift. + */ + if (SHIFT == SHIFT_LEFT) { + path = ext4_find_extent(inode, start - 1, &path, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + if (extent) { + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + ex_start = 0; + ex_end = 0; + } - /* - * Don't start shifting extents until we make sure the hole is big - * enough to accomodate the shift. - */ - path = ext4_find_extent(inode, start - 1, &path, 0); - if (IS_ERR(path)) - return PTR_ERR(path); - depth = path->p_depth; - extent = path[depth].p_ext; - if (extent) { - ex_start = le32_to_cpu(extent->ee_block); - ex_end = le32_to_cpu(extent->ee_block) + - ext4_ext_get_actual_len(extent); - } else { - ex_start = 0; - ex_end = 0; + if ((start == ex_start && shift > ex_start) || + (shift > start - ex_end)) { + ext4_ext_drop_refs(path); + kfree(path); + return -EINVAL; + } } - if ((start == ex_start && shift > ex_start) || - (shift > start - ex_end)) - return -EINVAL; + /* + * In case of left shift, iterator points to start and it is increased + * till we reach stop. In case of right shift, iterator points to stop + * and it is decreased till we reach start. + */ + if (SHIFT == SHIFT_LEFT) + iterator = &start; + else + iterator = &stop; /* Its safe to start updating extents */ - while (start < stop_block) { - path = ext4_find_extent(inode, start, &path, 0); + while (start < stop) { + path = ext4_find_extent(inode, *iterator, &path, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (!extent) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", - (unsigned long) start); + (unsigned long) *iterator); return -EIO; } - if (start > le32_to_cpu(extent->ee_block)) { + if (SHIFT == SHIFT_LEFT && *iterator > + le32_to_cpu(extent->ee_block)) { /* Hole, move to the next extent */ if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { path[depth].p_ext++; } else { - start = ext4_ext_next_allocated_block(path); + *iterator = ext4_ext_next_allocated_block(path); continue; } } + + if (SHIFT == SHIFT_LEFT) { + extent = EXT_LAST_EXTENT(path[depth].p_hdr); + *iterator = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + extent = EXT_FIRST_EXTENT(path[depth].p_hdr); + *iterator = le32_to_cpu(extent->ee_block) > 0 ? + le32_to_cpu(extent->ee_block) - 1 : 0; + /* Update path extent in case we need to stop */ + while (le32_to_cpu(extent->ee_block) < start) + extent++; + path[depth].p_ext = extent; + } ret = ext4_ext_shift_path_extents(path, shift, inode, - handle, &start); + handle, SHIFT); if (ret) break; } @@ -5485,7 +5551,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_discard_preallocations(inode); ret = ext4_ext_shift_extents(inode, handle, punch_stop, - punch_stop - punch_start); + punch_stop - punch_start, SHIFT_LEFT); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; @@ -5510,6 +5576,174 @@ out_mutex: return ret; } +/* + * ext4_insert_range: + * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate. + * The data blocks starting from @offset to the EOF are shifted by @len + * towards right to create a hole in the @inode. Inode size is increased + * by len bytes. + * Returns 0 on success, error otherwise. + */ +int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) +{ + struct super_block *sb = inode->i_sb; + handle_t *handle; + struct ext4_ext_path *path; + struct ext4_extent *extent; + ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0; + unsigned int credits, ee_len; + int ret = 0, depth, split_flag = 0; + loff_t ioffset; + + /* + * We need to test this early because xfstests assumes that an + * insert range of (0, 1) will return EOPNOTSUPP if the file + * system does not support insert range. + */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return -EOPNOTSUPP; + + /* Insert range works only on fs block size aligned offsets. */ + if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || + len & (EXT4_CLUSTER_SIZE(sb) - 1)) + return -EINVAL; + + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + trace_ext4_insert_range(inode, offset, len); + + offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); + len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); + + /* Call ext4_force_commit to flush all data in case of data=journal */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; + } + + /* + * Need to round down to align start offset to page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + + /* Write out all dirty pages */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); + if (ret) + return ret; + + /* Take mutex lock */ + mutex_lock(&inode->i_mutex); + + /* Currently just for extent based files */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + /* Check for wrap through zero */ + if (inode->i_size + len > inode->i_sb->s_maxbytes) { + ret = -EFBIG; + goto out_mutex; + } + + /* Offset should be less than i_size */ + if (offset >= i_size_read(inode)) { + ret = -EINVAL; + goto out_mutex; + } + + truncate_pagecache(inode, ioffset); + + /* Wait for existing dio to complete */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_dio; + } + + /* Expand file to avoid data loss if there is error while shifting */ + inode->i_size += len; + EXT4_I(inode)->i_disksize += len; + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ret = ext4_mark_inode_dirty(handle, inode); + if (ret) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + path = ext4_find_extent(inode, offset_lblk, NULL, 0); + if (IS_ERR(path)) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + depth = ext_depth(inode); + extent = path[depth].p_ext; + if (extent) { + ee_start_lblk = le32_to_cpu(extent->ee_block); + ee_len = ext4_ext_get_actual_len(extent); + + /* + * If offset_lblk is not the starting block of extent, split + * the extent @offset_lblk + */ + if ((offset_lblk > ee_start_lblk) && + (offset_lblk < (ee_start_lblk + ee_len))) { + if (ext4_ext_is_unwritten(extent)) + split_flag = EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2; + ret = ext4_split_extent_at(handle, inode, &path, + offset_lblk, split_flag, + EXT4_EX_NOCACHE | + EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_METADATA_NOFAIL); + } + + ext4_ext_drop_refs(path); + kfree(path); + if (ret < 0) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + } + + ret = ext4_es_remove_extent(inode, offset_lblk, + EXT_MAX_BLOCKS - offset_lblk); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + /* + * if offset_lblk lies in a hole which is at start of file, use + * ee_start_lblk to shift extents + */ + ret = ext4_ext_shift_extents(inode, handle, + ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk, + len_lblk, SHIFT_RIGHT); + + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + +out_stop: + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; +} + /** * ext4_swap_extents - Swap extents between two inodes * @@ -5542,7 +5776,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); BUG_ON(!mutex_is_locked(&inode1->i_mutex)); - BUG_ON(!mutex_is_locked(&inode1->i_mutex)); + BUG_ON(!mutex_is_locked(&inode2->i_mutex)); *erp = ext4_es_remove_extent(inode1, lblk1, count); if (unlikely(*erp)) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 0613c25..ac517f1 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -223,9 +223,11 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) struct inode *inode = file->f_mapping->host; if (ext4_encrypted_inode(inode)) { - int err = ext4_generate_encryption_key(inode); + int err = ext4_get_encryption_info(inode); if (err) return 0; + if (ext4_encryption_info(inode) == NULL) + return -ENOKEY; } file_accessed(file); if (IS_DAX(file_inode(file))) { @@ -278,6 +280,13 @@ static int ext4_file_open(struct inode * inode, struct file * filp) ext4_journal_stop(handle); } } + if (ext4_encrypted_inode(inode)) { + ret = ext4_get_encryption_info(inode); + if (ret) + return -EACCES; + if (ext4_encryption_info(inode) == NULL) + return -ENOKEY; + } /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present @@ -287,13 +296,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (ret < 0) return ret; } - ret = dquot_file_open(inode, filp); - if (!ret && ext4_encrypted_inode(inode)) { - ret = ext4_generate_encryption_key(inode); - if (ret) - ret = -EACCES; - } - return ret; + return dquot_file_open(inode, filp); } /* diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 1eaa6cb..173c1ae 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -726,11 +726,25 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, ext4_group_t i; ext4_group_t flex_group; struct ext4_group_info *grp; + int encrypt = 0; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) return ERR_PTR(-EPERM); + if ((ext4_encrypted_inode(dir) || + DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { + err = ext4_get_encryption_info(dir); + if (err) + return ERR_PTR(err); + if (ext4_encryption_info(dir) == NULL) + return ERR_PTR(-EPERM); + if (!handle) + nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb); + encrypt = 1; + } + sb = dir->i_sb; ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); @@ -996,12 +1010,6 @@ got: ei->i_block_group = group; ei->i_last_alloc_group = ~0; - /* If the directory encrypted, then we should encrypt the inode. */ - if ((S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) && - (ext4_encrypted_inode(dir) || - DUMMY_ENCRYPTION_ENABLED(sbi))) - ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); - ext4_set_inode_flags(inode); if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); @@ -1034,28 +1042,9 @@ got: ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - if ((sbi->s_file_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID) && - (sbi->s_dir_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID)) { - ei->i_inline_off = 0; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_INLINE_DATA)) - ext4_set_inode_state(inode, - EXT4_STATE_MAY_INLINE_DATA); - } else { - /* Inline data and encryption are incompatible - * We turn off inline data since encryption is enabled */ - ei->i_inline_off = 1; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_INLINE_DATA)) - ext4_clear_inode_state(inode, - EXT4_STATE_MAY_INLINE_DATA); - } -#else ei->i_inline_off = 0; if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); -#endif ret = inode; err = dquot_alloc_inode(inode); if (err) @@ -1082,6 +1071,12 @@ got: ei->i_datasync_tid = handle->h_transaction->t_tid; } + if (encrypt) { + err = ext4_inherit_context(dir, inode); + if (err) + goto fail_free_drop; + } + err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_std_error(sb, err); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 9588240..4f6ac49 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -565,7 +565,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { EXT4_ERROR_INODE(inode, "Can't allocate blocks for " "non-extent mapped inodes with bigalloc"); - return -ENOSPC; + return -EUCLEAN; } /* Set up for the direct block allocation */ @@ -576,6 +576,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, ar.flags = EXT4_MB_HINT_DATA; if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ar.flags |= EXT4_MB_DELALLOC_RESERVED; + if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) + ar.flags |= EXT4_MB_USE_RESERVED; ar.goal = ext4_find_goal(inode, map->m_lblk, partial); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 095c7a2..cd944a7 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -995,20 +995,18 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, * and -EEXIST if directory entry already exists. */ static int ext4_add_dirent_to_inline(handle_t *handle, + struct ext4_filename *fname, struct dentry *dentry, struct inode *inode, struct ext4_iloc *iloc, void *inline_start, int inline_size) { struct inode *dir = d_inode(dentry->d_parent); - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; int err; struct ext4_dir_entry_2 *de; - err = ext4_find_dest_de(dir, inode, iloc->bh, - inline_start, inline_size, - name, namelen, &de); + err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, + inline_size, fname, &de); if (err) return err; @@ -1016,8 +1014,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, err = ext4_journal_get_write_access(handle, iloc->bh); if (err) return err; - ext4_insert_dentry(dir, inode, de, inline_size, &dentry->d_name, - name, namelen); + ext4_insert_dentry(dir, inode, de, inline_size, fname); ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); @@ -1248,8 +1245,8 @@ out: * If succeeds, return 0. If not, extended the inline dir and copied data to * the new created block. */ -int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode) +int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode) { int ret, inline_size; void *inline_start; @@ -1268,7 +1265,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, + ret = ext4_add_dirent_to_inline(handle, fname, dentry, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; @@ -1289,8 +1286,9 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, if (inline_size) { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); - ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, - inline_start, inline_size); + ret = ext4_add_dirent_to_inline(handle, fname, dentry, + inode, &iloc, inline_start, + inline_size); if (ret != -ENOSPC) goto out; @@ -1611,6 +1609,7 @@ out: } struct buffer_head *ext4_find_inline_entry(struct inode *dir, + struct ext4_filename *fname, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *has_inline_data) @@ -1632,8 +1631,8 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, inline_start = (void *)ext4_raw_inode(&iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - ret = search_dir(iloc.bh, inline_start, inline_size, - dir, d_name, 0, res_dir); + ret = ext4_search_dir(iloc.bh, inline_start, inline_size, + dir, fname, d_name, 0, res_dir); if (ret == 1) goto out_find; if (ret < 0) @@ -1645,8 +1644,8 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, inline_start = ext4_get_inline_xattr_pos(dir, &iloc); inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; - ret = search_dir(iloc.bh, inline_start, inline_size, - dir, d_name, 0, res_dir); + ret = ext4_search_dir(iloc.bh, inline_start, inline_size, + dir, fname, d_name, 0, res_dir); if (ret == 1) goto out_find; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5168c9b..f8a8d4e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -731,18 +731,18 @@ int ext4_get_block(struct inode *inode, sector_t iblock, * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, - ext4_lblk_t block, int create) + ext4_lblk_t block, int map_flags) { struct ext4_map_blocks map; struct buffer_head *bh; + int create = map_flags & EXT4_GET_BLOCKS_CREATE; int err; J_ASSERT(handle != NULL || create == 0); map.m_lblk = block; map.m_len = 1; - err = ext4_map_blocks(handle, inode, &map, - create ? EXT4_GET_BLOCKS_CREATE : 0); + err = ext4_map_blocks(handle, inode, &map, map_flags); if (err == 0) return create ? ERR_PTR(-ENOSPC) : NULL; @@ -788,11 +788,11 @@ errout: } struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, - ext4_lblk_t block, int create) + ext4_lblk_t block, int map_flags) { struct buffer_head *bh; - bh = ext4_getblk(handle, inode, block, create); + bh = ext4_getblk(handle, inode, block, map_flags); if (IS_ERR(bh)) return bh; if (!bh || buffer_uptodate(bh)) @@ -1261,13 +1261,12 @@ static int ext4_journalled_write_end(struct file *file, } /* - * Reserve a single cluster located at lblock + * Reserve space for a single cluster */ -static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) +static int ext4_da_reserve_space(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); - unsigned int md_needed; int ret; /* @@ -1279,25 +1278,14 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) if (ret) return ret; - /* - * recalculate the amount of metadata blocks to reserve - * in order to allocate nrblocks - * worse case is one extent per block - */ spin_lock(&ei->i_block_reservation_lock); - /* - * ext4_calc_metadata_amount() has side effects, which we have - * to be prepared undo if we fail to claim space. - */ - md_needed = 0; - trace_ext4_da_reserve_space(inode, 0); - if (ext4_claim_free_clusters(sbi, 1, 0)) { spin_unlock(&ei->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); return -ENOSPC; } ei->i_reserved_data_blocks++; + trace_ext4_da_reserve_space(inode); spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ @@ -1566,9 +1554,9 @@ add_delayed: * then we don't need to reserve it again. However we still need * to reserve metadata for every block we're going to write. */ - if (EXT4_SB(inode->i_sb)->s_cluster_ratio <= 1 || + if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 || !ext4_find_delalloc_cluster(inode, map->m_lblk)) { - ret = ext4_da_reserve_space(inode, iblock); + ret = ext4_da_reserve_space(inode); if (ret) { /* not enough space to reserve */ retval = ret; @@ -1701,19 +1689,32 @@ static int __ext4_journalled_writepage(struct page *page, ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); } - /* As soon as we unlock the page, it can go away, but we have - * references to buffers so we are safe */ + /* + * We need to release the page lock before we start the + * journal, so grab a reference so the page won't disappear + * out from under us. + */ + get_page(page); unlock_page(page); handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out; + put_page(page); + goto out_no_pagelock; } - BUG_ON(!ext4_handle_valid(handle)); + lock_page(page); + put_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + ext4_journal_stop(handle); + ret = 0; + goto out; + } + if (inline_data) { BUFFER_TRACE(inode_bh, "get write access"); ret = ext4_journal_get_write_access(handle, inode_bh); @@ -1739,6 +1740,8 @@ static int __ext4_journalled_writepage(struct page *page, NULL, bput_one); ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: + unlock_page(page); +out_no_pagelock: brelse(inode_bh); return ret; } @@ -4681,8 +4684,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ext4_journal_stop(handle); } - if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { + if (attr->ia_valid & ATTR_SIZE) { handle_t *handle; + loff_t oldsize = inode->i_size; + int shrink = (attr->ia_size <= inode->i_size); if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -4690,24 +4695,26 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_size > sbi->s_bitmap_maxbytes) return -EFBIG; } + if (!S_ISREG(inode->i_mode)) + return -EINVAL; if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) inode_inc_iversion(inode); - if (S_ISREG(inode->i_mode) && + if (ext4_should_order_data(inode) && (attr->ia_size < inode->i_size)) { - if (ext4_should_order_data(inode)) { - error = ext4_begin_ordered_truncate(inode, + error = ext4_begin_ordered_truncate(inode, attr->ia_size); - if (error) - goto err_out; - } + if (error) + goto err_out; + } + if (attr->ia_size != inode->i_size) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; } - if (ext4_handle_valid(handle)) { + if (ext4_handle_valid(handle) && shrink) { error = ext4_orphan_add(handle, inode); orphan = 1; } @@ -4726,15 +4733,13 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) up_write(&EXT4_I(inode)->i_data_sem); ext4_journal_stop(handle); if (error) { - ext4_orphan_del(NULL, inode); + if (orphan) + ext4_orphan_del(NULL, inode); goto err_out; } - } else { - loff_t oldsize = inode->i_size; - - i_size_write(inode, attr->ia_size); - pagecache_isize_extended(inode, oldsize, inode->i_size); } + if (!shrink) + pagecache_isize_extended(inode, oldsize, inode->i_size); /* * Blocks are going to be removed from the inode. Wait @@ -4754,13 +4759,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * in data=journal mode to make pages freeable. */ truncate_pagecache(inode, inode->i_size); + if (shrink) + ext4_truncate(inode); } - /* - * We want to call ext4_truncate() even if attr->ia_size == - * inode->i_size for cases like truncation of fallocated space - */ - if (attr->ia_valid & ATTR_SIZE) - ext4_truncate(inode); if (!rc) { setattr_copy(inode, attr); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 2cb9e17..cb84512 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -31,14 +31,11 @@ static void memswap(void *a, void *b, size_t len) { unsigned char *ap, *bp; - unsigned char tmp; ap = (unsigned char *)a; bp = (unsigned char *)b; while (len-- > 0) { - tmp = *ap; - *ap = *bp; - *bp = tmp; + swap(*ap, *bp); ap++; bp++; } @@ -675,8 +672,8 @@ encryption_policy_out: if (err) return err; } - if (copy_to_user((void *) arg, sbi->s_es->s_encrypt_pw_salt, - 16)) + if (copy_to_user((void __user *) arg, + sbi->s_es->s_encrypt_pw_salt, 16)) return -EFAULT; return 0; } @@ -690,7 +687,7 @@ encryption_policy_out: err = ext4_get_policy(inode, &policy); if (err) return err; - if (copy_to_user((void *)arg, &policy, sizeof(policy))) + if (copy_to_user((void __user *)arg, &policy, sizeof(policy))) return -EFAULT; return 0; #else diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8d1e602..f6aedf8 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -26,6 +26,7 @@ #include <linux/log2.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/backing-dev.h> #include <trace/events/ext4.h> #ifdef CONFIG_EXT4_DEBUG @@ -882,10 +883,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* wait for I/O completion */ for (i = 0, group = first_group; i < groups_per_page; i++, group++) { - if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) { + if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) err = -EIO; - goto out; - } } first_block = page->index * blocks_per_page; @@ -898,6 +897,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* skip initialized uptodate buddy */ continue; + if (!buffer_verified(bh[group - first_group])) + /* Skip faulty bitmaps */ + continue; + err = 0; + /* * data carry information regarding this * particular group in the format specified @@ -2008,7 +2012,12 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, } } -/* This is now called BEFORE we load the buddy bitmap. */ +/* + * This is now called BEFORE we load the buddy bitmap. + * Returns either 1 or 0 indicating that the group is either suitable + * for the allocation or not. In addition it can also return negative + * error code when something goes wrong. + */ static int ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, int cr) { @@ -2031,7 +2040,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { int ret = ext4_mb_init_group(ac->ac_sb, group); if (ret) - return 0; + return ret; } fragments = grp->bb_fragments; @@ -2078,7 +2087,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t ngroups, group, i; int cr; - int err = 0; + int err = 0, first_err = 0; struct ext4_sb_info *sbi; struct super_block *sb; struct ext4_buddy e4b; @@ -2145,6 +2154,7 @@ repeat: group = ac->ac_g_ex.fe_group; for (i = 0; i < ngroups; group++, i++) { + int ret = 0; cond_resched(); /* * Artificially restricted ngroups for non-extent @@ -2154,8 +2164,12 @@ repeat: group = 0; /* This now checks without needing the buddy page */ - if (!ext4_mb_good_group(ac, group, cr)) + ret = ext4_mb_good_group(ac, group, cr); + if (ret <= 0) { + if (!first_err) + first_err = ret; continue; + } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) @@ -2167,9 +2181,12 @@ repeat: * We need to check again after locking the * block group */ - if (!ext4_mb_good_group(ac, group, cr)) { + ret = ext4_mb_good_group(ac, group, cr); + if (ret <= 0) { ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); + if (!first_err) + first_err = ret; continue; } @@ -2216,6 +2233,8 @@ repeat: } } out: + if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) + err = first_err; return err; } @@ -2257,12 +2276,9 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) group--; if (group == 0) - seq_printf(seq, "#%-5s: %-5s %-5s %-5s " - "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " - "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", - "group", "free", "frags", "first", - "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", - "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); + seq_puts(seq, "#group: free frags first [" + " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " + " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]"); i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + sizeof(struct ext4_group_info); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 370420b..fb6f117 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -166,12 +166,9 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2, */ wait_on_page_writeback(page[0]); wait_on_page_writeback(page[1]); - if (inode1 > inode2) { - struct page *tmp; - tmp = page[0]; - page[0] = page[1]; - page[1] = tmp; - } + if (inode1 > inode2) + swap(page[0], page[1]); + return 0; } @@ -574,12 +571,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - /* TODO: This is non obvious task to swap blocks for inodes with full - jornaling enabled */ + + /* TODO: it's not obvious how to swap blocks for inodes with full + journaling enabled */ if (ext4_should_journal_data(orig_inode) || ext4_should_journal_data(donor_inode)) { - return -EINVAL; + ext4_msg(orig_inode->i_sb, KERN_ERR, + "Online defrag not supported with data journaling"); + return -EOPNOTSUPP; } + /* Protect orig and donor inodes against a truncate */ lock_two_nondirectories(orig_inode, donor_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5fdb9f6a..011dcfb 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -61,7 +61,7 @@ static struct buffer_head *ext4_append(handle_t *handle, *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - bh = ext4_bread(handle, inode, *block, 1); + bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE); if (IS_ERR(bh)) return bh; inode->i_size += inode->i_sb->s_blocksize; @@ -84,12 +84,13 @@ typedef enum { } dirblock_type_t; #define ext4_read_dirblock(inode, block, type) \ - __ext4_read_dirblock((inode), (block), (type), __LINE__) + __ext4_read_dirblock((inode), (block), (type), __func__, __LINE__) static struct buffer_head *__ext4_read_dirblock(struct inode *inode, - ext4_lblk_t block, - dirblock_type_t type, - unsigned int line) + ext4_lblk_t block, + dirblock_type_t type, + const char *func, + unsigned int line) { struct buffer_head *bh; struct ext4_dir_entry *dirent; @@ -97,15 +98,17 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, bh = ext4_bread(NULL, inode, block, 0); if (IS_ERR(bh)) { - __ext4_warning(inode->i_sb, __func__, line, - "error %ld reading directory block " - "(ino %lu, block %lu)", PTR_ERR(bh), inode->i_ino, - (unsigned long) block); + __ext4_warning(inode->i_sb, func, line, + "inode #%lu: lblock %lu: comm %s: " + "error %ld reading directory block", + inode->i_ino, (unsigned long)block, + current->comm, PTR_ERR(bh)); return bh; } if (!bh) { - ext4_error_inode(inode, __func__, line, block, "Directory hole found"); + ext4_error_inode(inode, func, line, block, + "Directory hole found"); return ERR_PTR(-EIO); } dirent = (struct ext4_dir_entry *) bh->b_data; @@ -119,7 +122,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, is_dx_block = 1; } if (!is_dx_block && type == INDEX) { - ext4_error_inode(inode, __func__, line, block, + ext4_error_inode(inode, func, line, block, "directory leaf block found instead of index block"); return ERR_PTR(-EIO); } @@ -136,8 +139,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, if (ext4_dx_csum_verify(inode, dirent)) set_buffer_verified(bh); else { - ext4_error_inode(inode, __func__, line, block, - "Directory index failed checksum"); + ext4_error_inode(inode, func, line, block, + "Directory index failed checksum"); brelse(bh); return ERR_PTR(-EIO); } @@ -146,8 +149,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, if (ext4_dirent_csum_verify(inode, dirent)) set_buffer_verified(bh); else { - ext4_error_inode(inode, __func__, line, block, - "Directory block failed checksum"); + ext4_error_inode(inode, func, line, block, + "Directory block failed checksum"); brelse(bh); return ERR_PTR(-EIO); } @@ -248,7 +251,7 @@ static void dx_set_count(struct dx_entry *entries, unsigned value); static void dx_set_limit(struct dx_entry *entries, unsigned value); static unsigned dx_root_limit(struct inode *dir, unsigned infosize); static unsigned dx_node_limit(struct inode *dir); -static struct dx_frame *dx_probe(const struct qstr *d_name, +static struct dx_frame *dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame); @@ -267,10 +270,10 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frames, __u32 *start_hash); static struct buffer_head * ext4_dx_find_entry(struct inode *dir, - const struct qstr *d_name, + struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir); -static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode); +static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode); /* checksumming functions */ void initialize_dirent_tail(struct ext4_dir_entry_tail *t, @@ -327,10 +330,14 @@ static __le32 ext4_dirent_csum(struct inode *inode, return cpu_to_le32(csum); } -static void warn_no_space_for_csum(struct inode *inode) +#define warn_no_space_for_csum(inode) \ + __warn_no_space_for_csum((inode), __func__, __LINE__) + +static void __warn_no_space_for_csum(struct inode *inode, const char *func, + unsigned int line) { - ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for " - "checksum. Please run e2fsck -D.", inode->i_ino); + __ext4_warning_inode(inode, func, line, + "No space for directory leaf checksum. Please run e2fsck -D."); } int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) @@ -607,17 +614,15 @@ static struct stats dx_show_leaf(struct inode *dir, char *name; struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; - struct ext4_fname_crypto_ctx *ctx = NULL; - int res; + int res = 0; name = de->name; len = de->name_len; - ctx = ext4_get_fname_crypto_ctx(dir, - EXT4_NAME_LEN); - if (IS_ERR(ctx)) { - printk(KERN_WARNING "Error acquiring" - " crypto ctxt--skipping crypto\n"); - ctx = NULL; + if (ext4_encrypted_inode(inode)) + res = ext4_get_encryption_info(dir); + if (res) { + printk(KERN_WARNING "Error setting up" + " fname crypto: %d\n", res); } if (ctx == NULL) { /* Directory is not encrypted */ @@ -637,7 +642,6 @@ static struct stats dx_show_leaf(struct inode *dir, "allocating crypto " "buffer--skipping " "crypto\n"); - ext4_put_fname_crypto_ctx(&ctx); ctx = NULL; } res = ext4_fname_disk_to_usr(ctx, NULL, de, @@ -658,7 +662,6 @@ static struct stats dx_show_leaf(struct inode *dir, printk("%*.s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); - ext4_put_fname_crypto_ctx(&ctx); ext4_fname_crypto_free_buffer( &fname_crypto_str); } @@ -724,7 +727,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, * back to userspace. */ static struct dx_frame * -dx_probe(const struct qstr *d_name, struct inode *dir, +dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame_in) { unsigned count, indirect; @@ -742,56 +745,41 @@ dx_probe(const struct qstr *d_name, struct inode *dir, if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && root->info.hash_version != DX_HASH_LEGACY) { - ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", - root->info.hash_version); + ext4_warning_inode(dir, "Unrecognised inode hash code %u", + root->info.hash_version); goto fail; } + if (fname) + hinfo = &fname->hinfo; hinfo->hash_version = root->info.hash_version; if (hinfo->hash_version <= DX_HASH_TEA) hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - if (d_name) { - struct ext4_fname_crypto_ctx *ctx = NULL; - int res; - - /* Check if the directory is encrypted */ - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) { - ret_err = ERR_PTR(PTR_ERR(ctx)); - goto fail; - } - res = ext4_fname_usr_to_hash(ctx, d_name, hinfo); - if (res < 0) { - ret_err = ERR_PTR(res); - goto fail; - } - ext4_put_fname_crypto_ctx(&ctx); - } -#else - if (d_name) - ext4fs_dirhash(d_name->name, d_name->len, hinfo); -#endif + if (fname && fname_name(fname)) + ext4fs_dirhash(fname_name(fname), fname_len(fname), hinfo); hash = hinfo->hash; if (root->info.unused_flags & 1) { - ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", - root->info.unused_flags); + ext4_warning_inode(dir, "Unimplemented hash flags: %#06x", + root->info.unused_flags); goto fail; } - if ((indirect = root->info.indirect_levels) > 1) { - ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", - root->info.indirect_levels); + indirect = root->info.indirect_levels; + if (indirect > 1) { + ext4_warning_inode(dir, "Unimplemented hash depth: %#06x", + root->info.indirect_levels); goto fail; } - entries = (struct dx_entry *) (((char *)&root->info) + - root->info.info_length); + entries = (struct dx_entry *)(((char *)&root->info) + + root->info.info_length); if (dx_get_limit(entries) != dx_root_limit(dir, root->info.info_length)) { - ext4_warning(dir->i_sb, "dx entry: limit != root limit"); + ext4_warning_inode(dir, "dx entry: limit %u != root limit %u", + dx_get_limit(entries), + dx_root_limit(dir, root->info.info_length)); goto fail; } @@ -799,15 +787,16 @@ dx_probe(const struct qstr *d_name, struct inode *dir, while (1) { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { - ext4_warning(dir->i_sb, - "dx entry: no count or count > limit"); + ext4_warning_inode(dir, + "dx entry: count %u beyond limit %u", + count, dx_get_limit(entries)); goto fail; } p = entries + 1; q = entries + count - 1; while (p <= q) { - m = p + (q - p)/2; + m = p + (q - p) / 2; dxtrace(printk(".")); if (dx_get_hash(m) > hash) q = m - 1; @@ -831,7 +820,8 @@ dx_probe(const struct qstr *d_name, struct inode *dir, } at = p - 1; - dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); + dxtrace(printk(" %x->%u\n", at == entries ? 0 : dx_get_hash(at), + dx_get_block(at))); frame->entries = entries; frame->at = at; if (!indirect--) @@ -845,9 +835,10 @@ dx_probe(const struct qstr *d_name, struct inode *dir, } entries = ((struct dx_node *) frame->bh->b_data)->entries; - if (dx_get_limit(entries) != dx_node_limit (dir)) { - ext4_warning(dir->i_sb, - "dx entry: limit != node limit"); + if (dx_get_limit(entries) != dx_node_limit(dir)) { + ext4_warning_inode(dir, + "dx entry: limit %u != node limit %u", + dx_get_limit(entries), dx_node_limit(dir)); goto fail; } } @@ -858,18 +849,17 @@ fail: } if (ret_err == ERR_PTR(ERR_BAD_DX_DIR)) - ext4_warning(dir->i_sb, - "Corrupt dir inode %lu, running e2fsck is " - "recommended.", dir->i_ino); + ext4_warning_inode(dir, + "Corrupt directory, running e2fsck is recommended"); return ret_err; } -static void dx_release (struct dx_frame *frames) +static void dx_release(struct dx_frame *frames) { if (frames[0].bh == NULL) return; - if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) + if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels) brelse(frames[1].bh); brelse(frames[0].bh); } @@ -962,7 +952,6 @@ static int htree_dirblock_to_tree(struct file *dir_file, struct buffer_head *bh; struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; - struct ext4_fname_crypto_ctx *ctx = NULL; struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}, tmp_str; dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", @@ -977,17 +966,15 @@ static int htree_dirblock_to_tree(struct file *dir_file, EXT4_DIR_REC_LEN(0)); #ifdef CONFIG_EXT4_FS_ENCRYPTION /* Check if the directory is encrypted */ - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) { - err = PTR_ERR(ctx); - brelse(bh); - return err; - } - if (ctx != NULL) { - err = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, + if (ext4_encrypted_inode(dir)) { + err = ext4_get_encryption_info(dir); + if (err < 0) { + brelse(bh); + return err; + } + err = ext4_fname_crypto_alloc_buffer(dir, EXT4_NAME_LEN, &fname_crypto_str); if (err < 0) { - ext4_put_fname_crypto_ctx(&ctx); brelse(bh); return err; } @@ -1008,16 +995,17 @@ static int htree_dirblock_to_tree(struct file *dir_file, continue; if (de->inode == 0) continue; - if (ctx == NULL) { - /* Directory is not encrypted */ + if (!ext4_encrypted_inode(dir)) { tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &tmp_str); } else { + int save_len = fname_crypto_str.len; + /* Directory is encrypted */ - err = ext4_fname_disk_to_usr(ctx, hinfo, de, + err = ext4_fname_disk_to_usr(dir, hinfo, de, &fname_crypto_str); if (err < 0) { count = err; @@ -1026,6 +1014,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &fname_crypto_str); + fname_crypto_str.len = save_len; } if (err != 0) { count = err; @@ -1036,7 +1025,6 @@ static int htree_dirblock_to_tree(struct file *dir_file, errout: brelse(bh); #ifdef CONFIG_EXT4_FS_ENCRYPTION - ext4_put_fname_crypto_ctx(&ctx); ext4_fname_crypto_free_buffer(&fname_crypto_str); #endif return count; @@ -1155,12 +1143,13 @@ errout: static inline int search_dirblock(struct buffer_head *bh, struct inode *dir, + struct ext4_filename *fname, const struct qstr *d_name, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { - return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, - d_name, offset, res_dir); + return ext4_search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, + fname, d_name, offset, res_dir); } /* @@ -1242,54 +1231,54 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) * `len <= EXT4_NAME_LEN' is guaranteed by caller. * `de != NULL' is guaranteed by caller. */ -static inline int ext4_match(struct ext4_fname_crypto_ctx *ctx, - struct ext4_str *fname_crypto_str, - int len, const char * const name, +static inline int ext4_match(struct ext4_filename *fname, struct ext4_dir_entry_2 *de) { - int res; + const void *name = fname_name(fname); + u32 len = fname_len(fname); if (!de->inode) return 0; #ifdef CONFIG_EXT4_FS_ENCRYPTION - if (ctx) - return ext4_fname_match(ctx, fname_crypto_str, len, name, de); + if (unlikely(!name)) { + if (fname->usr_fname->name[0] == '_') { + int ret; + if (de->name_len < 16) + return 0; + ret = memcmp(de->name + de->name_len - 16, + fname->crypto_buf.name + 8, 16); + return (ret == 0) ? 1 : 0; + } + name = fname->crypto_buf.name; + len = fname->crypto_buf.len; + } #endif - if (len != de->name_len) + if (de->name_len != len) return 0; - res = memcmp(name, de->name, len); - return (res == 0) ? 1 : 0; + return (memcmp(de->name, name, len) == 0) ? 1 : 0; } /* * Returns 0 if not found, -1 on failure, and 1 on success */ -int search_dir(struct buffer_head *bh, char *search_buf, int buf_size, - struct inode *dir, const struct qstr *d_name, - unsigned int offset, struct ext4_dir_entry_2 **res_dir) +int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, + struct inode *dir, struct ext4_filename *fname, + const struct qstr *d_name, + unsigned int offset, struct ext4_dir_entry_2 **res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; int de_len; - const char *name = d_name->name; - int namelen = d_name->len; - struct ext4_fname_crypto_ctx *ctx = NULL; - struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; int res; - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return -1; - de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ if ((char *) de + de->name_len <= dlimit) { - res = ext4_match(ctx, &fname_crypto_str, namelen, - name, de); + res = ext4_match(fname, de); if (res < 0) { res = -1; goto return_result; @@ -1322,8 +1311,6 @@ int search_dir(struct buffer_head *bh, char *search_buf, int buf_size, res = 0; return_result: - ext4_put_fname_crypto_ctx(&ctx); - ext4_fname_crypto_free_buffer(&fname_crypto_str); return res; } @@ -1370,7 +1357,8 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, buffer */ int num = 0; ext4_lblk_t nblocks; - int i, namelen; + int i, namelen, retval; + struct ext4_filename fname; *res_dir = NULL; sb = dir->i_sb; @@ -1378,14 +1366,18 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, if (namelen > EXT4_NAME_LEN) return NULL; + retval = ext4_fname_setup_filename(dir, d_name, 1, &fname); + if (retval) + return ERR_PTR(retval); + if (ext4_has_inline_data(dir)) { int has_inline_data = 1; - ret = ext4_find_inline_entry(dir, d_name, res_dir, + ret = ext4_find_inline_entry(dir, &fname, d_name, res_dir, &has_inline_data); if (has_inline_data) { if (inlined) *inlined = 1; - return ret; + goto cleanup_and_exit; } } @@ -1400,14 +1392,14 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, goto restart; } if (is_dx(dir)) { - bh = ext4_dx_find_entry(dir, d_name, res_dir); + ret = ext4_dx_find_entry(dir, &fname, res_dir); /* * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the * old fashioned way. */ - if (!IS_ERR(bh) || PTR_ERR(bh) != ERR_BAD_DX_DIR) - return bh; + if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR) + goto cleanup_and_exit; dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " "falling back\n")); } @@ -1438,8 +1430,10 @@ restart: num++; bh = ext4_getblk(NULL, dir, b++, 0); if (unlikely(IS_ERR(bh))) { - if (ra_max == 0) - return bh; + if (ra_max == 0) { + ret = bh; + goto cleanup_and_exit; + } break; } bh_use[ra_max] = bh; @@ -1469,7 +1463,7 @@ restart: goto next; } set_buffer_verified(bh); - i = search_dirblock(bh, dir, d_name, + i = search_dirblock(bh, dir, &fname, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (i == 1) { EXT4_I(dir)->i_dir_start_lookup = block; @@ -1500,15 +1494,17 @@ cleanup_and_exit: /* Clean up the read-ahead blocks */ for (; ra_ptr < ra_max; ra_ptr++) brelse(bh_use[ra_ptr]); + ext4_fname_free_filename(&fname); return ret; } -static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir) +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir) { struct super_block * sb = dir->i_sb; - struct dx_hash_info hinfo; struct dx_frame frames[2], *frame; + const struct qstr *d_name = fname->usr_fname; struct buffer_head *bh; ext4_lblk_t block; int retval; @@ -1516,7 +1512,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q #ifdef CONFIG_EXT4_FS_ENCRYPTION *res_dir = NULL; #endif - frame = dx_probe(d_name, dir, &hinfo, frames); + frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return (struct buffer_head *) frame; do { @@ -1525,7 +1521,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q if (IS_ERR(bh)) goto errout; - retval = search_dirblock(bh, dir, d_name, + retval = search_dirblock(bh, dir, fname, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (retval == 1) @@ -1537,12 +1533,12 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q } /* Check to see if we should continue to search */ - retval = ext4_htree_next_block(dir, hinfo.hash, frame, + retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame, frames, NULL); if (retval < 0) { - ext4_warning(sb, - "error %d reading index page in directory #%lu", - retval, dir->i_ino); + ext4_warning_inode(dir, + "error %d reading directory index block", + retval); bh = ERR_PTR(retval); goto errout; } @@ -1796,32 +1792,16 @@ journal_error: int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct buffer_head *bh, void *buf, int buf_size, - const char *name, int namelen, + struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de) { struct ext4_dir_entry_2 *de; - unsigned short reclen = EXT4_DIR_REC_LEN(namelen); + unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname)); int nlen, rlen; unsigned int offset = 0; char *top; - struct ext4_fname_crypto_ctx *ctx = NULL; - struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; int res; - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return -1; - - if (ctx != NULL) { - /* Calculate record length needed to store the entry */ - res = ext4_fname_crypto_namelen_on_disk(ctx, namelen); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - return res; - } - reclen = EXT4_DIR_REC_LEN(res); - } - de = (struct ext4_dir_entry_2 *)buf; top = buf + buf_size - reclen; while ((char *) de <= top) { @@ -1831,7 +1811,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, goto return_result; } /* Provide crypto context and crypto buffer to ext4 match */ - res = ext4_match(ctx, &fname_crypto_str, namelen, name, de); + res = ext4_match(fname, de); if (res < 0) goto return_result; if (res > 0) { @@ -1853,8 +1833,6 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, res = 0; } return_result: - ext4_put_fname_crypto_ctx(&ctx); - ext4_fname_crypto_free_buffer(&fname_crypto_str); return res; } @@ -1862,39 +1840,10 @@ int ext4_insert_dentry(struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, - const struct qstr *iname, - const char *name, int namelen) + struct ext4_filename *fname) { int nlen, rlen; - struct ext4_fname_crypto_ctx *ctx = NULL; - struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; - struct ext4_str tmp_str; - int res; - - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return -EIO; - /* By default, the input name would be written to the disk */ - tmp_str.name = (unsigned char *)name; - tmp_str.len = namelen; - if (ctx != NULL) { - /* Directory is encrypted */ - res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, - &fname_crypto_str); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - return -ENOMEM; - } - res = ext4_fname_usr_to_disk(ctx, iname, &fname_crypto_str); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - ext4_fname_crypto_free_buffer(&fname_crypto_str); - return res; - } - tmp_str.name = fname_crypto_str.name; - tmp_str.len = fname_crypto_str.len; - } nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); @@ -1908,11 +1857,8 @@ int ext4_insert_dentry(struct inode *dir, de->file_type = EXT4_FT_UNKNOWN; de->inode = cpu_to_le32(inode->i_ino); ext4_set_de_type(inode->i_sb, de, inode->i_mode); - de->name_len = tmp_str.len; - - memcpy(de->name, tmp_str.name, tmp_str.len); - ext4_put_fname_crypto_ctx(&ctx); - ext4_fname_crypto_free_buffer(&fname_crypto_str); + de->name_len = fname_len(fname); + memcpy(de->name, fname_name(fname), fname_len(fname)); return 0; } @@ -1924,13 +1870,11 @@ int ext4_insert_dentry(struct inode *dir, * space. It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. */ -static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, +static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, + struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, struct buffer_head *bh) { - struct inode *dir = d_inode(dentry->d_parent); - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; int err; @@ -1939,9 +1883,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, csum_size = sizeof(struct ext4_dir_entry_tail); if (!de) { - err = ext4_find_dest_de(dir, inode, - bh, bh->b_data, blocksize - csum_size, - name, namelen, &de); + err = ext4_find_dest_de(dir, inode, bh, bh->b_data, + blocksize - csum_size, fname, &de); if (err) return err; } @@ -1954,8 +1897,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, /* By now the buffer is marked for journaling. Due to crypto operations, * the following function call may fail */ - err = ext4_insert_dentry(dir, inode, de, blocksize, &dentry->d_name, - name, namelen); + err = ext4_insert_dentry(dir, inode, de, blocksize, fname); if (err < 0) return err; @@ -1985,17 +1927,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, * This converts a one block unindexed directory to a 3 block indexed * directory, and adds the dentry to the indexed directory. */ -static int make_indexed_dir(handle_t *handle, struct dentry *dentry, +static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode, struct buffer_head *bh) { struct inode *dir = d_inode(dentry->d_parent); -#ifdef CONFIG_EXT4_FS_ENCRYPTION - struct ext4_fname_crypto_ctx *ctx = NULL; - int res; -#else - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; -#endif struct buffer_head *bh2; struct dx_root *root; struct dx_frame frames[2], *frame; @@ -2006,17 +1942,10 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, unsigned len; int retval; unsigned blocksize; - struct dx_hash_info hinfo; ext4_lblk_t block; struct fake_dirent *fde; int csum_size = 0; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); -#endif - if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); @@ -2078,22 +2007,12 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); /* Initialize as for dx_probe */ - hinfo.hash_version = root->info.hash_version; - if (hinfo.hash_version <= DX_HASH_TEA) - hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; - hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - res = ext4_fname_usr_to_hash(ctx, &dentry->d_name, &hinfo); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - ext4_mark_inode_dirty(handle, dir); - brelse(bh); - return res; - } - ext4_put_fname_crypto_ctx(&ctx); -#else - ext4fs_dirhash(name, namelen, &hinfo); -#endif + fname->hinfo.hash_version = root->info.hash_version; + if (fname->hinfo.hash_version <= DX_HASH_TEA) + fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + ext4fs_dirhash(fname_name(fname), fname_len(fname), &fname->hinfo); + memset(frames, 0, sizeof(frames)); frame = frames; frame->entries = entries; @@ -2108,14 +2027,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, if (retval) goto out_frames; - de = do_split(handle,dir, &bh, frame, &hinfo); + de = do_split(handle,dir, &bh, frame, &fname->hinfo); if (IS_ERR(de)) { retval = PTR_ERR(de); goto out_frames; } dx_release(frames); - retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh); brelse(bh); return retval; out_frames: @@ -2147,6 +2066,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct ext4_dir_entry_2 *de; struct ext4_dir_entry_tail *t; struct super_block *sb; + struct ext4_filename fname; int retval; int dx_fallback=0; unsigned blocksize; @@ -2161,10 +2081,15 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (!dentry->d_name.len) return -EINVAL; + retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname); + if (retval) + return retval; + if (ext4_has_inline_data(dir)) { - retval = ext4_try_add_inline_entry(handle, dentry, inode); + retval = ext4_try_add_inline_entry(handle, &fname, + dentry, inode); if (retval < 0) - return retval; + goto out; if (retval == 1) { retval = 0; goto out; @@ -2172,7 +2097,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, } if (is_dx(dir)) { - retval = ext4_dx_add_entry(handle, dentry, inode); + retval = ext4_dx_add_entry(handle, &fname, dentry, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) goto out; ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); @@ -2182,24 +2107,31 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, blocks = dir->i_size >> sb->s_blocksize_bits; for (block = 0; block < blocks; block++) { bh = ext4_read_dirblock(dir, block, DIRENT); - if (IS_ERR(bh)) - return PTR_ERR(bh); - - retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (IS_ERR(bh)) { + retval = PTR_ERR(bh); + bh = NULL; + goto out; + } + retval = add_dirent_to_buf(handle, &fname, dir, inode, + NULL, bh); if (retval != -ENOSPC) goto out; if (blocks == 1 && !dx_fallback && EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { - retval = make_indexed_dir(handle, dentry, inode, bh); + retval = make_indexed_dir(handle, &fname, dentry, + inode, bh); bh = NULL; /* make_indexed_dir releases bh */ goto out; } brelse(bh); } bh = ext4_append(handle, dir, &block); - if (IS_ERR(bh)) - return PTR_ERR(bh); + if (IS_ERR(bh)) { + retval = PTR_ERR(bh); + bh = NULL; + goto out; + } de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize); @@ -2209,8 +2141,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, initialize_dirent_tail(t, blocksize); } - retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh); out: + ext4_fname_free_filename(&fname); brelse(bh); if (retval == 0) ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); @@ -2220,19 +2153,18 @@ out: /* * Returns 0 for success, or a negative error value */ -static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode) +static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode) { struct dx_frame frames[2], *frame; struct dx_entry *entries, *at; - struct dx_hash_info hinfo; struct buffer_head *bh; struct inode *dir = d_inode(dentry->d_parent); struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; int err; - frame = dx_probe(&dentry->d_name, dir, &hinfo, frames); + frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return PTR_ERR(frame); entries = frame->entries; @@ -2249,7 +2181,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; - err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh); if (err != -ENOSPC) goto cleanup; @@ -2267,7 +2199,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (levels && (dx_get_count(frames->entries) == dx_get_limit(frames->entries))) { - ext4_warning(sb, "Directory index full!"); + ext4_warning_inode(dir, "Directory index full!"); err = -ENOSPC; goto cleanup; } @@ -2345,12 +2277,12 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto cleanup; } } - de = do_split(handle, dir, &bh, frame, &hinfo); + de = do_split(handle, dir, &bh, frame, &fname->hinfo); if (IS_ERR(de)) { err = PTR_ERR(de); goto cleanup; } - err = add_dirent_to_buf(handle, dentry, inode, de, bh); + err = add_dirent_to_buf(handle, fname, dir, inode, de, bh); goto cleanup; journal_error: @@ -2517,20 +2449,7 @@ retry: inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); - err = 0; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - if (!err && (ext4_encrypted_inode(dir) || - DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)))) { - err = ext4_inherit_context(dir, inode); - if (err) { - clear_nlink(inode); - unlock_new_inode(inode); - iput(inode); - } - } -#endif - if (!err) - err = ext4_add_nondir(handle, dentry, inode); + err = ext4_add_nondir(handle, dentry, inode); if (!err && IS_DIRSYNC(dir)) ext4_handle_sync(handle); } @@ -2711,14 +2630,6 @@ retry: err = ext4_init_new_dir(handle, dir, inode); if (err) goto out_clear_inode; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - if (ext4_encrypted_inode(dir) || - DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) { - err = ext4_inherit_context(dir, inode); - if (err) - goto out_clear_inode; - } -#endif err = ext4_mark_inode_dirty(handle, inode); if (!err) err = ext4_add_entry(handle, dentry, inode); @@ -2779,12 +2690,9 @@ int ext4_empty_dir(struct inode *inode) de = (struct ext4_dir_entry_2 *) bh->b_data; de1 = ext4_next_entry(de, sb->s_blocksize); if (le32_to_cpu(de->inode) != inode->i_ino || - !le32_to_cpu(de1->inode) || - strcmp(".", de->name) || - strcmp("..", de1->name)) { - ext4_warning(inode->i_sb, - "bad directory (dir #%lu) - no `.' or `..'", - inode->i_ino); + le32_to_cpu(de1->inode) == 0 || + strcmp(".", de->name) || strcmp("..", de1->name)) { + ext4_warning_inode(inode, "directory missing '.' and/or '..'"); brelse(bh); return 1; } @@ -3037,8 +2945,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) if (retval) goto end_rmdir; if (!EXT4_DIR_LINK_EMPTY(inode)) - ext4_warning(inode->i_sb, - "empty directory has too many links (%d)", + ext4_warning_inode(inode, + "empty directory '%.*s' has too many links (%u)", + dentry->d_name.len, dentry->d_name.name, inode->i_nlink); inode->i_version++; clear_nlink(inode); @@ -3098,10 +3007,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - if (!inode->i_nlink) { - ext4_warning(inode->i_sb, - "Deleting nonexistent file (%lu), %d", - inode->i_ino, inode->i_nlink); + if (inode->i_nlink == 0) { + ext4_warning_inode(inode, "Deleting file '%.*s' with no links", + dentry->d_name.len, dentry->d_name.name); set_nlink(inode, 1); } retval = ext4_delete_entry(handle, dir, de, bh); @@ -3140,10 +3048,23 @@ static int ext4_symlink(struct inode *dir, encryption_required = (ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))); - if (encryption_required) - disk_link.len = encrypted_symlink_data_len(len) + 1; - if (disk_link.len > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; + if (encryption_required) { + err = ext4_get_encryption_info(dir); + if (err) + return err; + if (ext4_encryption_info(dir) == NULL) + return -EPERM; + disk_link.len = (ext4_fname_encrypted_size(dir, len) + + sizeof(struct ext4_encrypted_symlink_data)); + sd = kzalloc(disk_link.len, GFP_KERNEL); + if (!sd) + return -ENOMEM; + } + + if (disk_link.len > dir->i_sb->s_blocksize) { + err = -ENAMETOOLONG; + goto err_free_sd; + } dquot_initialize(dir); @@ -3174,34 +3095,19 @@ static int ext4_symlink(struct inode *dir, if (IS_ERR(inode)) { if (handle) ext4_journal_stop(handle); - return PTR_ERR(inode); + err = PTR_ERR(inode); + goto err_free_sd; } if (encryption_required) { - struct ext4_fname_crypto_ctx *ctx = NULL; struct qstr istr; struct ext4_str ostr; - sd = kzalloc(disk_link.len, GFP_NOFS); - if (!sd) { - err = -ENOMEM; - goto err_drop_inode; - } - err = ext4_inherit_context(dir, inode); - if (err) - goto err_drop_inode; - ctx = ext4_get_fname_crypto_ctx(inode, - inode->i_sb->s_blocksize); - if (IS_ERR_OR_NULL(ctx)) { - /* We just set the policy, so ctx should not be NULL */ - err = (ctx == NULL) ? -EIO : PTR_ERR(ctx); - goto err_drop_inode; - } istr.name = (const unsigned char *) symname; istr.len = len; ostr.name = sd->encrypted_path; - err = ext4_fname_usr_to_disk(ctx, &istr, &ostr); - ext4_put_fname_crypto_ctx(&ctx); + ostr.len = disk_link.len; + err = ext4_fname_usr_to_disk(inode, &istr, &ostr); if (err < 0) goto err_drop_inode; sd->len = cpu_to_le16(ostr.len); @@ -3271,10 +3177,11 @@ static int ext4_symlink(struct inode *dir, err_drop_inode: if (handle) ext4_journal_stop(handle); - kfree(sd); clear_nlink(inode); unlock_new_inode(inode); iput(inode); +err_free_sd: + kfree(sd); return err; } @@ -3490,9 +3397,9 @@ static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent, } if (retval) { - ext4_warning(ent->dir->i_sb, - "Deleting old file (%lu), %d, error=%d", - ent->dir->i_ino, ent->dir->i_nlink, retval); + ext4_warning_inode(ent->dir, + "Deleting old file: nlink %d, error=%d", + ent->dir->i_nlink, retval); } } @@ -3762,6 +3669,15 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, u8 new_file_type; int retval; + if ((ext4_encrypted_inode(old_dir) || + ext4_encrypted_inode(new_dir)) && + (old_dir != new_dir) && + (!ext4_is_child_context_consistent_with_parent(new_dir, + old.inode) || + !ext4_is_child_context_consistent_with_parent(old_dir, + new.inode))) + return -EPERM; + dquot_initialize(old.dir); dquot_initialize(new.dir); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 5765f88..5602450 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -84,7 +84,7 @@ static void ext4_finish_bio(struct bio *bio) /* The bounce data pages are unmapped. */ data_page = page; ctx = (struct ext4_crypto_ctx *)page_private(data_page); - page = ctx->control_page; + page = ctx->w.control_page; } #endif @@ -359,7 +359,6 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { bio_get(io->io_bio); submit_bio(io->io_op, io->io_bio); - BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); bio_put(io->io_bio); } io->io_bio = NULL; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 171b9ac..ec3ef93 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -54,8 +54,8 @@ static void completion_pages(struct work_struct *work) { #ifdef CONFIG_EXT4_FS_ENCRYPTION struct ext4_crypto_ctx *ctx = - container_of(work, struct ext4_crypto_ctx, work); - struct bio *bio = ctx->bio; + container_of(work, struct ext4_crypto_ctx, r.work); + struct bio *bio = ctx->r.bio; struct bio_vec *bv; int i; @@ -109,9 +109,9 @@ static void mpage_end_io(struct bio *bio, int err) if (err) { ext4_release_crypto_ctx(ctx); } else { - INIT_WORK(&ctx->work, completion_pages); - ctx->bio = bio; - queue_work(ext4_read_workqueue, &ctx->work); + INIT_WORK(&ctx->r.work, completion_pages); + ctx->r.bio = bio; + queue_work(ext4_read_workqueue, &ctx->r.work); return; } } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5f3c43a..5c78764 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -24,6 +24,7 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/parser.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> @@ -591,14 +592,17 @@ void __ext4_msg(struct super_block *sb, va_end(args); } +#define ext4_warning_ratelimit(sb) \ + ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), \ + "EXT4-fs warning") + void __ext4_warning(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; - if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), - "EXT4-fs warning")) + if (!ext4_warning_ratelimit(sb)) return; va_start(args, fmt); @@ -609,6 +613,24 @@ void __ext4_warning(struct super_block *sb, const char *function, va_end(args); } +void __ext4_warning_inode(const struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (!ext4_warning_ratelimit(inode->i_sb)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: " + "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, + function, line, inode->i_ino, current->comm, &vaf); + va_end(args); +} + void __ext4_grp_locked_error(const char *function, unsigned int line, struct super_block *sb, ext4_group_t grp, unsigned long ino, ext4_fsblk_t block, @@ -807,6 +829,7 @@ static void ext4_put_super(struct super_block *sb) dump_orphan_list(sb, sbi); J_ASSERT(list_empty(&sbi->s_orphan)); + sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { /* @@ -879,9 +902,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); #ifdef CONFIG_EXT4_FS_ENCRYPTION - ei->i_encryption_key.mode = EXT4_ENCRYPTION_MODE_INVALID; + ei->i_crypt_info = NULL; #endif - return &ei->vfs_inode; } @@ -958,6 +980,10 @@ void ext4_clear_inode(struct inode *inode) jbd2_free_inode(EXT4_I(inode)->jinode); EXT4_I(inode)->jinode = NULL; } +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (EXT4_I(inode)->i_crypt_info) + ext4_free_encryption_info(inode, EXT4_I(inode)->i_crypt_info); +#endif } static struct inode *ext4_nfs_get_inode(struct super_block *sb, @@ -3448,11 +3474,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_bdev->bd_part) sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part, sectors[1]); -#ifdef CONFIG_EXT4_FS_ENCRYPTION - /* Modes of operations for file and directory encryption. */ - sbi->s_file_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; - sbi->s_dir_encryption_mode = EXT4_ENCRYPTION_MODE_INVALID; -#endif /* Cleanup superblock name */ strreplace(sb->s_id, '/', '!'); @@ -4065,7 +4086,15 @@ no_journal: } } - if (unlikely(sbi->s_mount_flags & EXT4_MF_TEST_DUMMY_ENCRYPTION) && + if ((DUMMY_ENCRYPTION_ENABLED(sbi) || + EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) && + (blocksize != PAGE_CACHE_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Unsupported blocksize for fs encryption"); + goto failed_mount_wq; + } + + if (DUMMY_ENCRYPTION_ENABLED(sbi) && !(sb->s_flags & MS_RDONLY) && !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) { EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT); @@ -4941,6 +4970,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); } + if (*flags & MS_LAZYTIME) + sb->s_flags |= MS_LAZYTIME; + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { err = -EROFS; @@ -5408,6 +5440,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int err, offset = off & (sb->s_blocksize - 1); + int retries = 0; struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -5428,7 +5461,12 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, return -EIO; } - bh = ext4_bread(handle, inode, blk, 1); + do { + bh = ext4_bread(handle, inode, blk, + EXT4_GET_BLOCKS_CREATE | + EXT4_GET_BLOCKS_METADATA_NOFAIL); + } while (IS_ERR(bh) && (PTR_ERR(bh) == -ENOSPC) && + ext4_should_retry_alloc(inode->i_sb, &retries)); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) @@ -5645,6 +5683,7 @@ out7: static void __exit ext4_exit_fs(void) { + ext4_exit_crypto(); ext4_destroy_lazyinit_thread(); unregister_as_ext2(); unregister_as_ext3(); diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index ba5bd18..c677f2c 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -23,31 +23,28 @@ #include "xattr.h" #ifdef CONFIG_EXT4_FS_ENCRYPTION -static const char *ext4_follow_link(struct dentry *dentry, void **cookie) +static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cookie) { struct page *cpage = NULL; char *caddr, *paddr = NULL; struct ext4_str cstr, pstr; struct inode *inode = d_inode(dentry); - struct ext4_fname_crypto_ctx *ctx = NULL; struct ext4_encrypted_symlink_data *sd; loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); int res; u32 plen, max_size = inode->i_sb->s_blocksize; - ctx = ext4_get_fname_crypto_ctx(inode, inode->i_sb->s_blocksize); - if (IS_ERR(ctx)) - return ERR_CAST(ctx); + res = ext4_get_encryption_info(inode); + if (res) + return ERR_PTR(res); if (ext4_inode_is_fast_symlink(inode)) { caddr = (char *) EXT4_I(inode)->i_data; max_size = sizeof(EXT4_I(inode)->i_data); } else { cpage = read_mapping_page(inode->i_mapping, 0, NULL); - if (IS_ERR(cpage)) { - ext4_put_fname_crypto_ctx(&ctx); + if (IS_ERR(cpage)) return ERR_CAST(cpage); - } caddr = kmap(cpage); caddr[size] = 0; } @@ -71,20 +68,19 @@ static const char *ext4_follow_link(struct dentry *dentry, void **cookie) goto errout; } pstr.name = paddr; - res = _ext4_fname_disk_to_usr(ctx, NULL, &cstr, &pstr); + pstr.len = plen; + res = _ext4_fname_disk_to_usr(inode, NULL, &cstr, &pstr); if (res < 0) goto errout; /* Null-terminate the name */ if (res <= plen) paddr[res] = '\0'; - ext4_put_fname_crypto_ctx(&ctx); if (cpage) { kunmap(cpage); page_cache_release(cpage); } return *cookie = paddr; errout: - ext4_put_fname_crypto_ctx(&ctx); if (cpage) { kunmap(cpage); page_cache_release(cpage); @@ -95,7 +91,7 @@ errout: const struct inode_operations ext4_encrypted_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = ext4_follow_link, + .follow_link = ext4_encrypted_follow_link, .put_link = kfree_put_link, .setattr = ext4_setattr, .setxattr = generic_setxattr, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d9c5242..7dd63b7 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -53,7 +53,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == DIRTY_DENTS) { - if (sbi->sb->s_bdi->dirty_exceeded) + if (sbi->sb->s_bdi->wb.dirty_exceeded) return false; mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); @@ -70,7 +70,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else { - if (sbi->sb->s_bdi->dirty_exceeded) + if (sbi->sb->s_bdi->wb.dirty_exceeded) return false; } return res; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 8496357..79e7b87 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -9,6 +9,7 @@ * published by the Free Software Foundation. */ #include <linux/blkdev.h> +#include <linux/backing-dev.h> /* constant macro */ #define NULL_SEGNO ((unsigned int)(~0)) @@ -714,7 +715,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) */ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) { - if (sbi->sb->s_bdi->dirty_exceeded) + if (sbi->sb->s_bdi->wb.dirty_exceeded) return 0; if (type == DATA) diff --git a/fs/fat/file.c b/fs/fat/file.c index 442d50a..a08f103 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -11,6 +11,7 @@ #include <linux/compat.h> #include <linux/mount.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/fsnotify.h> #include <linux/security.h> #include "fat.h" diff --git a/fs/fat/inode.c b/fs/fat/inode.c index c067746..509411d 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -18,6 +18,7 @@ #include <linux/parser.h> #include <linux/uio.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <asm/unaligned.h> #include "fat.h" diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 32a8bbd..f0520bc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -27,6 +27,7 @@ #include <linux/backing-dev.h> #include <linux/tracepoint.h> #include <linux/device.h> +#include <linux/memcontrol.h> #include "internal.h" /* @@ -34,6 +35,10 @@ */ #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) +struct wb_completion { + atomic_t cnt; +}; + /* * Passed into wb_writeback(), essentially a subset of writeback_control */ @@ -47,13 +52,29 @@ struct wb_writeback_work { unsigned int range_cyclic:1; unsigned int for_background:1; unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ + unsigned int auto_free:1; /* free on completion */ + unsigned int single_wait:1; + unsigned int single_done:1; enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ - struct completion *done; /* set if the caller waits */ + struct wb_completion *done; /* set if the caller waits */ }; /* + * If one wants to wait for one or more wb_writeback_works, each work's + * ->done should be set to a wb_completion defined using the following + * macro. Once all work items are issued with wb_queue_work(), the caller + * can wait for the completion of all using wb_wait_for_completion(). Work + * items which are waited upon aren't freed automatically on completion. + */ +#define DEFINE_WB_COMPLETION_ONSTACK(cmpl) \ + struct wb_completion cmpl = { \ + .cnt = ATOMIC_INIT(1), \ + } + + +/* * If an inode is constantly having its pages dirtied, but then the * updates stop dirtytime_expire_interval seconds in the past, it's * possible for the worst case time between when an inode has its @@ -65,35 +86,6 @@ struct wb_writeback_work { */ unsigned int dirtytime_expire_interval = 12 * 60 * 60; -/** - * writeback_in_progress - determine whether there is writeback in progress - * @bdi: the device's backing_dev_info structure. - * - * Determine whether there is writeback waiting to be handled against a - * backing device. - */ -int writeback_in_progress(struct backing_dev_info *bdi) -{ - return test_bit(BDI_writeback_running, &bdi->state); -} -EXPORT_SYMBOL(writeback_in_progress); - -struct backing_dev_info *inode_to_bdi(struct inode *inode) -{ - struct super_block *sb; - - if (!inode) - return &noop_backing_dev_info; - - sb = inode->i_sb; -#ifdef CONFIG_BLOCK - if (sb_is_blkdev_sb(sb)) - return blk_get_backing_dev_info(I_BDEV(inode)); -#endif - return sb->s_bdi; -} -EXPORT_SYMBOL_GPL(inode_to_bdi); - static inline struct inode *wb_inode(struct list_head *head) { return list_entry(head, struct inode, i_wb_list); @@ -109,45 +101,830 @@ static inline struct inode *wb_inode(struct list_head *head) EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage); -static void bdi_wakeup_thread(struct backing_dev_info *bdi) +static bool wb_io_lists_populated(struct bdi_writeback *wb) +{ + if (wb_has_dirty_io(wb)) { + return false; + } else { + set_bit(WB_has_dirty_io, &wb->state); + WARN_ON_ONCE(!wb->avg_write_bandwidth); + atomic_long_add(wb->avg_write_bandwidth, + &wb->bdi->tot_write_bandwidth); + return true; + } +} + +static void wb_io_lists_depopulated(struct bdi_writeback *wb) { - spin_lock_bh(&bdi->wb_lock); - if (test_bit(BDI_registered, &bdi->state)) - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); - spin_unlock_bh(&bdi->wb_lock); + if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) && + list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) { + clear_bit(WB_has_dirty_io, &wb->state); + WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth, + &wb->bdi->tot_write_bandwidth) < 0); + } } -static void bdi_queue_work(struct backing_dev_info *bdi, - struct wb_writeback_work *work) +/** + * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list + * @inode: inode to be moved + * @wb: target bdi_writeback + * @head: one of @wb->b_{dirty|io|more_io} + * + * Move @inode->i_wb_list to @list of @wb and set %WB_has_dirty_io. + * Returns %true if @inode is the first occupant of the !dirty_time IO + * lists; otherwise, %false. + */ +static bool inode_wb_list_move_locked(struct inode *inode, + struct bdi_writeback *wb, + struct list_head *head) { - trace_writeback_queue(bdi, work); + assert_spin_locked(&wb->list_lock); + + list_move(&inode->i_wb_list, head); - spin_lock_bh(&bdi->wb_lock); - if (!test_bit(BDI_registered, &bdi->state)) { - if (work->done) - complete(work->done); + /* dirty_time doesn't count as dirty_io until expiration */ + if (head != &wb->b_dirty_time) + return wb_io_lists_populated(wb); + + wb_io_lists_depopulated(wb); + return false; +} + +/** + * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list + * @inode: inode to be removed + * @wb: bdi_writeback @inode is being removed from + * + * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and + * clear %WB_has_dirty_io if all are empty afterwards. + */ +static void inode_wb_list_del_locked(struct inode *inode, + struct bdi_writeback *wb) +{ + assert_spin_locked(&wb->list_lock); + + list_del_init(&inode->i_wb_list); + wb_io_lists_depopulated(wb); +} + +static void wb_wakeup(struct bdi_writeback *wb) +{ + spin_lock_bh(&wb->work_lock); + if (test_bit(WB_registered, &wb->state)) + mod_delayed_work(bdi_wq, &wb->dwork, 0); + spin_unlock_bh(&wb->work_lock); +} + +static void wb_queue_work(struct bdi_writeback *wb, + struct wb_writeback_work *work) +{ + trace_writeback_queue(wb->bdi, work); + + spin_lock_bh(&wb->work_lock); + if (!test_bit(WB_registered, &wb->state)) { + if (work->single_wait) + work->single_done = 1; goto out_unlock; } - list_add_tail(&work->list, &bdi->work_list); - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + if (work->done) + atomic_inc(&work->done->cnt); + list_add_tail(&work->list, &wb->work_list); + mod_delayed_work(bdi_wq, &wb->dwork, 0); out_unlock: - spin_unlock_bh(&bdi->wb_lock); + spin_unlock_bh(&wb->work_lock); +} + +/** + * wb_wait_for_completion - wait for completion of bdi_writeback_works + * @bdi: bdi work items were issued to + * @done: target wb_completion + * + * Wait for one or more work items issued to @bdi with their ->done field + * set to @done, which should have been defined with + * DEFINE_WB_COMPLETION_ONSTACK(). This function returns after all such + * work items are completed. Work items which are waited upon aren't freed + * automatically on completion. + */ +static void wb_wait_for_completion(struct backing_dev_info *bdi, + struct wb_completion *done) +{ + atomic_dec(&done->cnt); /* put down the initial count */ + wait_event(bdi->wb_waitq, !atomic_read(&done->cnt)); +} + +#ifdef CONFIG_CGROUP_WRITEBACK + +/* parameters for foreign inode detection, see wb_detach_inode() */ +#define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */ +#define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */ +#define WB_FRN_TIME_CUT_DIV 2 /* ignore rounds < avg / 2 */ +#define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */ + +#define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */ +#define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS) + /* each slot's duration is 2s / 16 */ +#define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2) + /* if foreign slots >= 8, switch */ +#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) + /* one round can affect upto 5 slots */ + +void __inode_attach_wb(struct inode *inode, struct page *page) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb = NULL; + + if (inode_cgwb_enabled(inode)) { + struct cgroup_subsys_state *memcg_css; + + if (page) { + memcg_css = mem_cgroup_css_from_page(page); + wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + } else { + /* must pin memcg_css, see wb_get_create() */ + memcg_css = task_get_css(current, memory_cgrp_id); + wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + css_put(memcg_css); + } + } + + if (!wb) + wb = &bdi->wb; + + /* + * There may be multiple instances of this function racing to + * update the same inode. Use cmpxchg() to tell the winner. + */ + if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) + wb_put(wb); +} + +/** + * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it + * @inode: inode of interest with i_lock held + * + * Returns @inode's wb with its list_lock held. @inode->i_lock must be + * held on entry and is released on return. The returned wb is guaranteed + * to stay @inode's associated wb until its list_lock is released. + */ +static struct bdi_writeback * +locked_inode_to_wb_and_lock_list(struct inode *inode) + __releases(&inode->i_lock) + __acquires(&wb->list_lock) +{ + while (true) { + struct bdi_writeback *wb = inode_to_wb(inode); + + /* + * inode_to_wb() association is protected by both + * @inode->i_lock and @wb->list_lock but list_lock nests + * outside i_lock. Drop i_lock and verify that the + * association hasn't changed after acquiring list_lock. + */ + wb_get(wb); + spin_unlock(&inode->i_lock); + spin_lock(&wb->list_lock); + wb_put(wb); /* not gonna deref it anymore */ + + /* i_wb may have changed inbetween, can't use inode_to_wb() */ + if (likely(wb == inode->i_wb)) + return wb; /* @inode already has ref */ + + spin_unlock(&wb->list_lock); + cpu_relax(); + spin_lock(&inode->i_lock); + } +} + +/** + * inode_to_wb_and_lock_list - determine an inode's wb and lock it + * @inode: inode of interest + * + * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held + * on entry. + */ +static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) + __acquires(&wb->list_lock) +{ + spin_lock(&inode->i_lock); + return locked_inode_to_wb_and_lock_list(inode); +} + +struct inode_switch_wbs_context { + struct inode *inode; + struct bdi_writeback *new_wb; + + struct rcu_head rcu_head; + struct work_struct work; +}; + +static void inode_switch_wbs_work_fn(struct work_struct *work) +{ + struct inode_switch_wbs_context *isw = + container_of(work, struct inode_switch_wbs_context, work); + struct inode *inode = isw->inode; + struct address_space *mapping = inode->i_mapping; + struct bdi_writeback *old_wb = inode->i_wb; + struct bdi_writeback *new_wb = isw->new_wb; + struct radix_tree_iter iter; + bool switched = false; + void **slot; + + /* + * By the time control reaches here, RCU grace period has passed + * since I_WB_SWITCH assertion and all wb stat update transactions + * between unlocked_inode_to_wb_begin/end() are guaranteed to be + * synchronizing against mapping->tree_lock. + * + * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock + * gives us exclusion against all wb related operations on @inode + * including IO list manipulations and stat updates. + */ + if (old_wb < new_wb) { + spin_lock(&old_wb->list_lock); + spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&new_wb->list_lock); + spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); + } + spin_lock(&inode->i_lock); + spin_lock_irq(&mapping->tree_lock); + + /* + * Once I_FREEING is visible under i_lock, the eviction path owns + * the inode and we shouldn't modify ->i_wb_list. + */ + if (unlikely(inode->i_state & I_FREEING)) + goto skip_switch; + + /* + * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points + * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to + * pages actually under underwriteback. + */ + radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, + PAGECACHE_TAG_DIRTY) { + struct page *page = radix_tree_deref_slot_protected(slot, + &mapping->tree_lock); + if (likely(page) && PageDirty(page)) { + __dec_wb_stat(old_wb, WB_RECLAIMABLE); + __inc_wb_stat(new_wb, WB_RECLAIMABLE); + } + } + + radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, + PAGECACHE_TAG_WRITEBACK) { + struct page *page = radix_tree_deref_slot_protected(slot, + &mapping->tree_lock); + if (likely(page)) { + WARN_ON_ONCE(!PageWriteback(page)); + __dec_wb_stat(old_wb, WB_WRITEBACK); + __inc_wb_stat(new_wb, WB_WRITEBACK); + } + } + + wb_get(new_wb); + + /* + * Transfer to @new_wb's IO list if necessary. The specific list + * @inode was on is ignored and the inode is put on ->b_dirty which + * is always correct including from ->b_dirty_time. The transfer + * preserves @inode->dirtied_when ordering. + */ + if (!list_empty(&inode->i_wb_list)) { + struct inode *pos; + + inode_wb_list_del_locked(inode, old_wb); + inode->i_wb = new_wb; + list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list) + if (time_after_eq(inode->dirtied_when, + pos->dirtied_when)) + break; + inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev); + } else { + inode->i_wb = new_wb; + } + + /* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */ + inode->i_wb_frn_winner = 0; + inode->i_wb_frn_avg_time = 0; + inode->i_wb_frn_history = 0; + switched = true; +skip_switch: + /* + * Paired with load_acquire in unlocked_inode_to_wb_begin() and + * ensures that the new wb is visible if they see !I_WB_SWITCH. + */ + smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH); + + spin_unlock_irq(&mapping->tree_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&new_wb->list_lock); + spin_unlock(&old_wb->list_lock); + + if (switched) { + wb_wakeup(new_wb); + wb_put(old_wb); + } + wb_put(new_wb); + + iput(inode); + kfree(isw); +} + +static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) +{ + struct inode_switch_wbs_context *isw = container_of(rcu_head, + struct inode_switch_wbs_context, rcu_head); + + /* needs to grab bh-unsafe locks, bounce to work item */ + INIT_WORK(&isw->work, inode_switch_wbs_work_fn); + schedule_work(&isw->work); +} + +/** + * inode_switch_wbs - change the wb association of an inode + * @inode: target inode + * @new_wb_id: ID of the new wb + * + * Switch @inode's wb association to the wb identified by @new_wb_id. The + * switching is performed asynchronously and may fail silently. + */ +static void inode_switch_wbs(struct inode *inode, int new_wb_id) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + struct cgroup_subsys_state *memcg_css; + struct inode_switch_wbs_context *isw; + + /* noop if seems to be already in progress */ + if (inode->i_state & I_WB_SWITCH) + return; + + isw = kzalloc(sizeof(*isw), GFP_ATOMIC); + if (!isw) + return; + + /* find and pin the new wb */ + rcu_read_lock(); + memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys); + if (memcg_css) + isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + rcu_read_unlock(); + if (!isw->new_wb) + goto out_free; + + /* while holding I_WB_SWITCH, no one else can update the association */ + spin_lock(&inode->i_lock); + if (inode->i_state & (I_WB_SWITCH | I_FREEING) || + inode_to_wb(inode) == isw->new_wb) { + spin_unlock(&inode->i_lock); + goto out_free; + } + inode->i_state |= I_WB_SWITCH; + spin_unlock(&inode->i_lock); + + ihold(inode); + isw->inode = inode; + + /* + * In addition to synchronizing among switchers, I_WB_SWITCH tells + * the RCU protected stat update paths to grab the mapping's + * tree_lock so that stat transfer can synchronize against them. + * Let's continue after I_WB_SWITCH is guaranteed to be visible. + */ + call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); + return; + +out_free: + if (isw->new_wb) + wb_put(isw->new_wb); + kfree(isw); +} + +/** + * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it + * @wbc: writeback_control of interest + * @inode: target inode + * + * @inode is locked and about to be written back under the control of @wbc. + * Record @inode's writeback context into @wbc and unlock the i_lock. On + * writeback completion, wbc_detach_inode() should be called. This is used + * to track the cgroup writeback context. + */ +void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) +{ + if (!inode_cgwb_enabled(inode)) { + spin_unlock(&inode->i_lock); + return; + } + + wbc->wb = inode_to_wb(inode); + wbc->inode = inode; + + wbc->wb_id = wbc->wb->memcg_css->id; + wbc->wb_lcand_id = inode->i_wb_frn_winner; + wbc->wb_tcand_id = 0; + wbc->wb_bytes = 0; + wbc->wb_lcand_bytes = 0; + wbc->wb_tcand_bytes = 0; + + wb_get(wbc->wb); + spin_unlock(&inode->i_lock); + + /* + * A dying wb indicates that the memcg-blkcg mapping has changed + * and a new wb is already serving the memcg. Switch immediately. + */ + if (unlikely(wb_dying(wbc->wb))) + inode_switch_wbs(inode, wbc->wb_id); +} + +/** + * wbc_detach_inode - disassociate wbc from inode and perform foreign detection + * @wbc: writeback_control of the just finished writeback + * + * To be called after a writeback attempt of an inode finishes and undoes + * wbc_attach_and_unlock_inode(). Can be called under any context. + * + * As concurrent write sharing of an inode is expected to be very rare and + * memcg only tracks page ownership on first-use basis severely confining + * the usefulness of such sharing, cgroup writeback tracks ownership + * per-inode. While the support for concurrent write sharing of an inode + * is deemed unnecessary, an inode being written to by different cgroups at + * different points in time is a lot more common, and, more importantly, + * charging only by first-use can too readily lead to grossly incorrect + * behaviors (single foreign page can lead to gigabytes of writeback to be + * incorrectly attributed). + * + * To resolve this issue, cgroup writeback detects the majority dirtier of + * an inode and transfers the ownership to it. To avoid unnnecessary + * oscillation, the detection mechanism keeps track of history and gives + * out the switch verdict only if the foreign usage pattern is stable over + * a certain amount of time and/or writeback attempts. + * + * On each writeback attempt, @wbc tries to detect the majority writer + * using Boyer-Moore majority vote algorithm. In addition to the byte + * count from the majority voting, it also counts the bytes written for the + * current wb and the last round's winner wb (max of last round's current + * wb, the winner from two rounds ago, and the last round's majority + * candidate). Keeping track of the historical winner helps the algorithm + * to semi-reliably detect the most active writer even when it's not the + * absolute majority. + * + * Once the winner of the round is determined, whether the winner is + * foreign or not and how much IO time the round consumed is recorded in + * inode->i_wb_frn_history. If the amount of recorded foreign IO time is + * over a certain threshold, the switch verdict is given. + */ +void wbc_detach_inode(struct writeback_control *wbc) +{ + struct bdi_writeback *wb = wbc->wb; + struct inode *inode = wbc->inode; + unsigned long avg_time, max_bytes, max_time; + u16 history; + int max_id; + + if (!wb) + return; + + history = inode->i_wb_frn_history; + avg_time = inode->i_wb_frn_avg_time; + + /* pick the winner of this round */ + if (wbc->wb_bytes >= wbc->wb_lcand_bytes && + wbc->wb_bytes >= wbc->wb_tcand_bytes) { + max_id = wbc->wb_id; + max_bytes = wbc->wb_bytes; + } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) { + max_id = wbc->wb_lcand_id; + max_bytes = wbc->wb_lcand_bytes; + } else { + max_id = wbc->wb_tcand_id; + max_bytes = wbc->wb_tcand_bytes; + } + + /* + * Calculate the amount of IO time the winner consumed and fold it + * into the running average kept per inode. If the consumed IO + * time is lower than avag / WB_FRN_TIME_CUT_DIV, ignore it for + * deciding whether to switch or not. This is to prevent one-off + * small dirtiers from skewing the verdict. + */ + max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT, + wb->avg_write_bandwidth); + if (avg_time) + avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) - + (avg_time >> WB_FRN_TIME_AVG_SHIFT); + else + avg_time = max_time; /* immediate catch up on first run */ + + if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) { + int slots; + + /* + * The switch verdict is reached if foreign wb's consume + * more than a certain proportion of IO time in a + * WB_FRN_TIME_PERIOD. This is loosely tracked by 16 slot + * history mask where each bit represents one sixteenth of + * the period. Determine the number of slots to shift into + * history from @max_time. + */ + slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT), + (unsigned long)WB_FRN_HIST_MAX_SLOTS); + history <<= slots; + if (wbc->wb_id != max_id) + history |= (1U << slots) - 1; + + /* + * Switch if the current wb isn't the consistent winner. + * If there are multiple closely competing dirtiers, the + * inode may switch across them repeatedly over time, which + * is okay. The main goal is avoiding keeping an inode on + * the wrong wb for an extended period of time. + */ + if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) + inode_switch_wbs(inode, max_id); + } + + /* + * Multiple instances of this function may race to update the + * following fields but we don't mind occassional inaccuracies. + */ + inode->i_wb_frn_winner = max_id; + inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX); + inode->i_wb_frn_history = history; + + wb_put(wbc->wb); + wbc->wb = NULL; +} + +/** + * wbc_account_io - account IO issued during writeback + * @wbc: writeback_control of the writeback in progress + * @page: page being written out + * @bytes: number of bytes being written out + * + * @bytes from @page are about to written out during the writeback + * controlled by @wbc. Keep the book for foreign inode detection. See + * wbc_detach_inode(). + */ +void wbc_account_io(struct writeback_control *wbc, struct page *page, + size_t bytes) +{ + int id; + + /* + * pageout() path doesn't attach @wbc to the inode being written + * out. This is intentional as we don't want the function to block + * behind a slow cgroup. Ultimately, we want pageout() to kick off + * regular writeback instead of writing things out itself. + */ + if (!wbc->wb) + return; + + rcu_read_lock(); + id = mem_cgroup_css_from_page(page)->id; + rcu_read_unlock(); + + if (id == wbc->wb_id) { + wbc->wb_bytes += bytes; + return; + } + + if (id == wbc->wb_lcand_id) + wbc->wb_lcand_bytes += bytes; + + /* Boyer-Moore majority vote algorithm */ + if (!wbc->wb_tcand_bytes) + wbc->wb_tcand_id = id; + if (id == wbc->wb_tcand_id) + wbc->wb_tcand_bytes += bytes; + else + wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); } -static void -__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - bool range_cyclic, enum wb_reason reason) +/** + * inode_congested - test whether an inode is congested + * @inode: inode to test for congestion + * @cong_bits: mask of WB_[a]sync_congested bits to test + * + * Tests whether @inode is congested. @cong_bits is the mask of congestion + * bits to test and the return value is the mask of set bits. + * + * If cgroup writeback is enabled for @inode, the congestion state is + * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg + * associated with @inode is congested; otherwise, the root wb's congestion + * state is used. + */ +int inode_congested(struct inode *inode, int cong_bits) +{ + /* + * Once set, ->i_wb never becomes NULL while the inode is alive. + * Start transaction iff ->i_wb is visible. + */ + if (inode && inode_to_wb_is_valid(inode)) { + struct bdi_writeback *wb; + bool locked, congested; + + wb = unlocked_inode_to_wb_begin(inode, &locked); + congested = wb_congested(wb, cong_bits); + unlocked_inode_to_wb_end(inode, locked); + return congested; + } + + return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); +} +EXPORT_SYMBOL_GPL(inode_congested); + +/** + * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work + * @bdi: bdi the work item was issued to + * @work: work item to wait for + * + * Wait for the completion of @work which was issued to one of @bdi's + * bdi_writeback's. The caller must have set @work->single_wait before + * issuing it. This wait operates independently fo + * wb_wait_for_completion() and also disables automatic freeing of @work. + */ +static void wb_wait_for_single_work(struct backing_dev_info *bdi, + struct wb_writeback_work *work) +{ + if (WARN_ON_ONCE(!work->single_wait)) + return; + + wait_event(bdi->wb_waitq, work->single_done); + + /* + * Paired with smp_wmb() in wb_do_writeback() and ensures that all + * modifications to @work prior to assertion of ->single_done is + * visible to the caller once this function returns. + */ + smp_rmb(); +} + +/** + * wb_split_bdi_pages - split nr_pages to write according to bandwidth + * @wb: target bdi_writeback to split @nr_pages to + * @nr_pages: number of pages to write for the whole bdi + * + * Split @wb's portion of @nr_pages according to @wb's write bandwidth in + * relation to the total write bandwidth of all wb's w/ dirty inodes on + * @wb->bdi. + */ +static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) +{ + unsigned long this_bw = wb->avg_write_bandwidth; + unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); + + if (nr_pages == LONG_MAX) + return LONG_MAX; + + /* + * This may be called on clean wb's and proportional distribution + * may not make sense, just use the original @nr_pages in those + * cases. In general, we wanna err on the side of writing more. + */ + if (!tot_bw || this_bw >= tot_bw) + return nr_pages; + else + return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw); +} + +/** + * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb + * @wb: target bdi_writeback + * @base_work: source wb_writeback_work + * + * Try to make a clone of @base_work and issue it to @wb. If cloning + * succeeds, %true is returned; otherwise, @base_work is issued directly + * and %false is returned. In the latter case, the caller is required to + * wait for @base_work's completion using wb_wait_for_single_work(). + * + * A clone is auto-freed on completion. @base_work never is. + */ +static bool wb_clone_and_queue_work(struct bdi_writeback *wb, + struct wb_writeback_work *base_work) { struct wb_writeback_work *work; + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + *work = *base_work; + work->auto_free = 1; + work->single_wait = 0; + } else { + work = base_work; + work->auto_free = 0; + work->single_wait = 1; + } + work->single_done = 0; + wb_queue_work(wb, work); + return work != base_work; +} + +/** + * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi + * @bdi: target backing_dev_info + * @base_work: wb_writeback_work to issue + * @skip_if_busy: skip wb's which already have writeback in progress + * + * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which + * have dirty inodes. If @base_work->nr_page isn't %LONG_MAX, it's + * distributed to the busy wbs according to each wb's proportion in the + * total active write bandwidth of @bdi. + */ +static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, + struct wb_writeback_work *base_work, + bool skip_if_busy) +{ + long nr_pages = base_work->nr_pages; + int next_blkcg_id = 0; + struct bdi_writeback *wb; + struct wb_iter iter; + + might_sleep(); + + if (!bdi_has_dirty_io(bdi)) + return; +restart: + rcu_read_lock(); + bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) { + if (!wb_has_dirty_io(wb) || + (skip_if_busy && writeback_in_progress(wb))) + continue; + + base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages); + if (!wb_clone_and_queue_work(wb, base_work)) { + next_blkcg_id = wb->blkcg_css->id + 1; + rcu_read_unlock(); + wb_wait_for_single_work(bdi, base_work); + goto restart; + } + } + rcu_read_unlock(); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static struct bdi_writeback * +locked_inode_to_wb_and_lock_list(struct inode *inode) + __releases(&inode->i_lock) + __acquires(&wb->list_lock) +{ + struct bdi_writeback *wb = inode_to_wb(inode); + + spin_unlock(&inode->i_lock); + spin_lock(&wb->list_lock); + return wb; +} + +static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) + __acquires(&wb->list_lock) +{ + struct bdi_writeback *wb = inode_to_wb(inode); + + spin_lock(&wb->list_lock); + return wb; +} + +static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) +{ + return nr_pages; +} + +static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, + struct wb_writeback_work *base_work, + bool skip_if_busy) +{ + might_sleep(); + + if (bdi_has_dirty_io(bdi) && + (!skip_if_busy || !writeback_in_progress(&bdi->wb))) { + base_work->auto_free = 0; + base_work->single_wait = 0; + base_work->single_done = 0; + wb_queue_work(&bdi->wb, base_work); + } +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + +void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, + bool range_cyclic, enum wb_reason reason) +{ + struct wb_writeback_work *work; + + if (!wb_has_dirty_io(wb)) + return; + /* * This is WB_SYNC_NONE writeback, so if allocation fails just * wakeup the thread for old dirty data writeback */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - trace_writeback_nowork(bdi); - bdi_wakeup_thread(bdi); + trace_writeback_nowork(wb->bdi); + wb_wakeup(wb); return; } @@ -155,46 +932,29 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work->nr_pages = nr_pages; work->range_cyclic = range_cyclic; work->reason = reason; + work->auto_free = 1; - bdi_queue_work(bdi, work); + wb_queue_work(wb, work); } /** - * bdi_start_writeback - start writeback - * @bdi: the backing device to write from - * @nr_pages: the number of pages to write - * @reason: reason why some writeback work was initiated - * - * Description: - * This does WB_SYNC_NONE opportunistic writeback. The IO is only - * started when this function returns, we make no guarantees on - * completion. Caller need not hold sb s_umount semaphore. - * - */ -void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - enum wb_reason reason) -{ - __bdi_start_writeback(bdi, nr_pages, true, reason); -} - -/** - * bdi_start_background_writeback - start background writeback - * @bdi: the backing device to write from + * wb_start_background_writeback - start background writeback + * @wb: bdi_writback to write from * * Description: * This makes sure WB_SYNC_NONE background writeback happens. When - * this function returns, it is only guaranteed that for given BDI + * this function returns, it is only guaranteed that for given wb * some IO is happening if we are over background dirty threshold. * Caller need not hold sb s_umount semaphore. */ -void bdi_start_background_writeback(struct backing_dev_info *bdi) +void wb_start_background_writeback(struct bdi_writeback *wb) { /* * We just wake up the flusher thread. It will perform background * writeback as soon as there is no other work to do. */ - trace_writeback_wake_background(bdi); - bdi_wakeup_thread(bdi); + trace_writeback_wake_background(wb->bdi); + wb_wakeup(wb); } /* @@ -202,11 +962,11 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) */ void inode_wb_list_del(struct inode *inode) { - struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb; - spin_lock(&bdi->wb.list_lock); - list_del_init(&inode->i_wb_list); - spin_unlock(&bdi->wb.list_lock); + wb = inode_to_wb_and_lock_list(inode); + inode_wb_list_del_locked(inode, wb); + spin_unlock(&wb->list_lock); } /* @@ -220,7 +980,6 @@ void inode_wb_list_del(struct inode *inode) */ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) { - assert_spin_locked(&wb->list_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -228,7 +987,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } - list_move(&inode->i_wb_list, &wb->b_dirty); + inode_wb_list_move_locked(inode, wb, &wb->b_dirty); } /* @@ -236,8 +995,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) */ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { - assert_spin_locked(&wb->list_lock); - list_move(&inode->i_wb_list, &wb->b_more_io); + inode_wb_list_move_locked(inode, wb, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) @@ -346,6 +1104,8 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, EXPIRE_DIRTY_ATIME, work); + if (moved) + wb_io_lists_populated(wb); trace_writeback_queue_io(wb, work, moved); } @@ -471,10 +1231,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, redirty_tail(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; - list_move(&inode->i_wb_list, &wb->b_dirty_time); + inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time); } else { /* The inode is clean. Remove from writeback lists. */ - list_del_init(&inode->i_wb_list); + inode_wb_list_del_locked(inode, wb); } } @@ -605,10 +1365,11 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; inode->i_state |= I_SYNC; - spin_unlock(&inode->i_lock); + wbc_attach_and_unlock_inode(wbc, inode); ret = __writeback_single_inode(inode, wbc); + wbc_detach_inode(wbc); spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); /* @@ -616,7 +1377,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * touch it. See comment above for explanation. */ if (!(inode->i_state & I_DIRTY_ALL)) - list_del_init(&inode->i_wb_list); + inode_wb_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: @@ -624,7 +1385,7 @@ out: return ret; } -static long writeback_chunk_size(struct backing_dev_info *bdi, +static long writeback_chunk_size(struct bdi_writeback *wb, struct wb_writeback_work *work) { long pages; @@ -645,8 +1406,8 @@ static long writeback_chunk_size(struct backing_dev_info *bdi, if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) pages = LONG_MAX; else { - pages = min(bdi->avg_write_bandwidth / 2, - global_dirty_limit / DIRTY_SCOPE); + pages = min(wb->avg_write_bandwidth / 2, + global_wb_domain.dirty_limit / DIRTY_SCOPE); pages = min(pages, work->nr_pages); pages = round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES); @@ -741,9 +1502,9 @@ static long writeback_sb_inodes(struct super_block *sb, continue; } inode->i_state |= I_SYNC; - spin_unlock(&inode->i_lock); + wbc_attach_and_unlock_inode(&wbc, inode); - write_chunk = writeback_chunk_size(wb->bdi, work); + write_chunk = writeback_chunk_size(wb, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; @@ -753,6 +1514,7 @@ static long writeback_sb_inodes(struct super_block *sb, */ __writeback_single_inode(inode, &wbc); + wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote += write_chunk - wbc.nr_to_write; spin_lock(&wb->list_lock); @@ -830,33 +1592,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, return nr_pages - work.nr_pages; } -static bool over_bground_thresh(struct backing_dev_info *bdi) -{ - unsigned long background_thresh, dirty_thresh; - - global_dirty_limits(&background_thresh, &dirty_thresh); - - if (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) > background_thresh) - return true; - - if (bdi_stat(bdi, BDI_RECLAIMABLE) > - bdi_dirty_limit(bdi, background_thresh)) - return true; - - return false; -} - -/* - * Called under wb->list_lock. If there are multiple wb per bdi, - * only the flusher working on the first wb should do it. - */ -static void wb_update_bandwidth(struct bdi_writeback *wb, - unsigned long start_time) -{ - __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time); -} - /* * Explicit flushing or periodic writeback of "old" data. * @@ -899,14 +1634,14 @@ static long wb_writeback(struct bdi_writeback *wb, * after the other works are all done. */ if ((work->for_background || work->for_kupdate) && - !list_empty(&wb->bdi->work_list)) + !list_empty(&wb->work_list)) break; /* * For background writeout, stop when we are below the * background dirty threshold */ - if (work->for_background && !over_bground_thresh(wb->bdi)) + if (work->for_background && !wb_over_bg_thresh(wb)) break; /* @@ -970,18 +1705,17 @@ static long wb_writeback(struct bdi_writeback *wb, /* * Return the next wb_writeback_work struct that hasn't been processed yet. */ -static struct wb_writeback_work * -get_next_work_item(struct backing_dev_info *bdi) +static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) { struct wb_writeback_work *work = NULL; - spin_lock_bh(&bdi->wb_lock); - if (!list_empty(&bdi->work_list)) { - work = list_entry(bdi->work_list.next, + spin_lock_bh(&wb->work_lock); + if (!list_empty(&wb->work_list)) { + work = list_entry(wb->work_list.next, struct wb_writeback_work, list); list_del_init(&work->list); } - spin_unlock_bh(&bdi->wb_lock); + spin_unlock_bh(&wb->work_lock); return work; } @@ -998,7 +1732,7 @@ static unsigned long get_nr_dirty_pages(void) static long wb_check_background_flush(struct bdi_writeback *wb) { - if (over_bground_thresh(wb->bdi)) { + if (wb_over_bg_thresh(wb)) { struct wb_writeback_work work = { .nr_pages = LONG_MAX, @@ -1053,25 +1787,33 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) */ static long wb_do_writeback(struct bdi_writeback *wb) { - struct backing_dev_info *bdi = wb->bdi; struct wb_writeback_work *work; long wrote = 0; - set_bit(BDI_writeback_running, &wb->bdi->state); - while ((work = get_next_work_item(bdi)) != NULL) { + set_bit(WB_writeback_running, &wb->state); + while ((work = get_next_work_item(wb)) != NULL) { + struct wb_completion *done = work->done; + bool need_wake_up = false; - trace_writeback_exec(bdi, work); + trace_writeback_exec(wb->bdi, work); wrote += wb_writeback(wb, work); - /* - * Notify the caller of completion if this is a synchronous - * work item, otherwise just free it. - */ - if (work->done) - complete(work->done); - else + if (work->single_wait) { + WARN_ON_ONCE(work->auto_free); + /* paired w/ rmb in wb_wait_for_single_work() */ + smp_wmb(); + work->single_done = 1; + need_wake_up = true; + } else if (work->auto_free) { kfree(work); + } + + if (done && atomic_dec_and_test(&done->cnt)) + need_wake_up = true; + + if (need_wake_up) + wake_up_all(&wb->bdi->wb_waitq); } /* @@ -1079,7 +1821,7 @@ static long wb_do_writeback(struct bdi_writeback *wb) */ wrote += wb_check_old_data_flush(wb); wrote += wb_check_background_flush(wb); - clear_bit(BDI_writeback_running, &wb->bdi->state); + clear_bit(WB_writeback_running, &wb->state); return wrote; } @@ -1088,43 +1830,42 @@ static long wb_do_writeback(struct bdi_writeback *wb) * Handle writeback of dirty data for the device backed by this bdi. Also * reschedules periodically and does kupdated style flushing. */ -void bdi_writeback_workfn(struct work_struct *work) +void wb_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), struct bdi_writeback, dwork); - struct backing_dev_info *bdi = wb->bdi; long pages_written; - set_worker_desc("flush-%s", dev_name(bdi->dev)); + set_worker_desc("flush-%s", dev_name(wb->bdi->dev)); current->flags |= PF_SWAPWRITE; if (likely(!current_is_workqueue_rescuer() || - !test_bit(BDI_registered, &bdi->state))) { + !test_bit(WB_registered, &wb->state))) { /* - * The normal path. Keep writing back @bdi until its + * The normal path. Keep writing back @wb until its * work_list is empty. Note that this path is also taken - * if @bdi is shutting down even when we're running off the + * if @wb is shutting down even when we're running off the * rescuer as work_list needs to be drained. */ do { pages_written = wb_do_writeback(wb); trace_writeback_pages_written(pages_written); - } while (!list_empty(&bdi->work_list)); + } while (!list_empty(&wb->work_list)); } else { /* * bdi_wq can't get enough workers and we're running off * the emergency worker. Don't hog it. Hopefully, 1024 is * enough for efficient IO. */ - pages_written = writeback_inodes_wb(&bdi->wb, 1024, + pages_written = writeback_inodes_wb(wb, 1024, WB_REASON_FORKER_THREAD); trace_writeback_pages_written(pages_written); } - if (!list_empty(&bdi->work_list)) + if (!list_empty(&wb->work_list)) mod_delayed_work(bdi_wq, &wb->dwork, 0); else if (wb_has_dirty_io(wb) && dirty_writeback_interval) - bdi_wakeup_thread_delayed(bdi); + wb_wakeup_delayed(wb); current->flags &= ~PF_SWAPWRITE; } @@ -1142,9 +1883,15 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { + struct bdi_writeback *wb; + struct wb_iter iter; + if (!bdi_has_dirty_io(bdi)) continue; - __bdi_start_writeback(bdi, nr_pages, false, reason); + + bdi_for_each_wb(wb, bdi, &iter, 0) + wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), + false, reason); } rcu_read_unlock(); } @@ -1173,9 +1920,12 @@ static void wakeup_dirtytime_writeback(struct work_struct *w) rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { - if (list_empty(&bdi->wb.b_dirty_time)) - continue; - bdi_wakeup_thread(bdi); + struct bdi_writeback *wb; + struct wb_iter iter; + + bdi_for_each_wb(wb, bdi, &iter, 0) + if (!list_empty(&bdi->wb.b_dirty_time)) + wb_wakeup(&bdi->wb); } rcu_read_unlock(); schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); @@ -1249,7 +1999,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; - struct backing_dev_info *bdi = NULL; int dirtytime; trace_writeback_mark_inode_dirty(inode, flags); @@ -1289,6 +2038,8 @@ void __mark_inode_dirty(struct inode *inode, int flags) if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; + inode_attach_wb(inode, NULL); + if (flags & I_DIRTY_INODE) inode->i_state &= ~I_DIRTY_TIME; inode->i_state |= flags; @@ -1317,38 +2068,39 @@ void __mark_inode_dirty(struct inode *inode, int flags) * reposition it (that would break b_dirty time-ordering). */ if (!was_dirty) { + struct bdi_writeback *wb; + struct list_head *dirty_list; bool wakeup_bdi = false; - bdi = inode_to_bdi(inode); - spin_unlock(&inode->i_lock); - spin_lock(&bdi->wb.list_lock); - if (bdi_cap_writeback_dirty(bdi)) { - WARN(!test_bit(BDI_registered, &bdi->state), - "bdi-%s not registered\n", bdi->name); + wb = locked_inode_to_wb_and_lock_list(inode); - /* - * If this is the first dirty inode for this - * bdi, we have to wake-up the corresponding - * bdi thread to make sure background - * write-back happens later. - */ - if (!wb_has_dirty_io(&bdi->wb)) - wakeup_bdi = true; - } + WARN(bdi_cap_writeback_dirty(wb->bdi) && + !test_bit(WB_registered, &wb->state), + "bdi-%s not registered\n", wb->bdi->name); inode->dirtied_when = jiffies; if (dirtytime) inode->dirtied_time_when = jiffies; + if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES)) - list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + dirty_list = &wb->b_dirty; else - list_move(&inode->i_wb_list, - &bdi->wb.b_dirty_time); - spin_unlock(&bdi->wb.list_lock); + dirty_list = &wb->b_dirty_time; + + wakeup_bdi = inode_wb_list_move_locked(inode, wb, + dirty_list); + + spin_unlock(&wb->list_lock); trace_writeback_dirty_inode_enqueue(inode); - if (wakeup_bdi) - bdi_wakeup_thread_delayed(bdi); + /* + * If this is the first dirty inode for this bdi, + * we have to wake-up the corresponding bdi thread + * to make sure background write-back happens + * later. + */ + if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi) + wb_wakeup_delayed(wb); return; } } @@ -1411,6 +2163,28 @@ static void wait_sb_inodes(struct super_block *sb) iput(old_inode); } +static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, + enum wb_reason reason, bool skip_if_busy) +{ + DEFINE_WB_COMPLETION_ONSTACK(done); + struct wb_writeback_work work = { + .sb = sb, + .sync_mode = WB_SYNC_NONE, + .tagged_writepages = 1, + .done = &done, + .nr_pages = nr, + .reason = reason, + }; + struct backing_dev_info *bdi = sb->s_bdi; + + if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) + return; + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy); + wb_wait_for_completion(bdi, &done); +} + /** * writeback_inodes_sb_nr - writeback dirty inodes from given super_block * @sb: the superblock @@ -1425,21 +2199,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, enum wb_reason reason) { - DECLARE_COMPLETION_ONSTACK(done); - struct wb_writeback_work work = { - .sb = sb, - .sync_mode = WB_SYNC_NONE, - .tagged_writepages = 1, - .done = &done, - .nr_pages = nr, - .reason = reason, - }; - - if (sb->s_bdi == &noop_backing_dev_info) - return; - WARN_ON(!rwsem_is_locked(&sb->s_umount)); - bdi_queue_work(sb->s_bdi, &work); - wait_for_completion(&done); + __writeback_inodes_sb_nr(sb, nr, reason, false); } EXPORT_SYMBOL(writeback_inodes_sb_nr); @@ -1467,19 +2227,15 @@ EXPORT_SYMBOL(writeback_inodes_sb); * Invoke writeback_inodes_sb_nr if no writeback is currently underway. * Returns 1 if writeback was started, 0 if not. */ -int try_to_writeback_inodes_sb_nr(struct super_block *sb, - unsigned long nr, - enum wb_reason reason) +bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, + enum wb_reason reason) { - if (writeback_in_progress(sb->s_bdi)) - return 1; - if (!down_read_trylock(&sb->s_umount)) - return 0; + return false; - writeback_inodes_sb_nr(sb, nr, reason); + __writeback_inodes_sb_nr(sb, nr, reason, true); up_read(&sb->s_umount); - return 1; + return true; } EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr); @@ -1491,7 +2247,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr); * Implement by try_to_writeback_inodes_sb_nr() * Returns 1 if writeback was started, 0 if not. */ -int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) +bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } @@ -1506,7 +2262,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb); */ void sync_inodes_sb(struct super_block *sb) { - DECLARE_COMPLETION_ONSTACK(done); + DEFINE_WB_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_ALL, @@ -1516,14 +2272,15 @@ void sync_inodes_sb(struct super_block *sb) .reason = WB_REASON_SYNC, .for_sync = 1, }; + struct backing_dev_info *bdi = sb->s_bdi; /* Nothing to do? */ - if (sb->s_bdi == &noop_backing_dev_info) + if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); - bdi_queue_work(sb->s_bdi, &work); - wait_for_completion(&done); + bdi_split_work_to_wbs(bdi, &work, false); + wb_wait_for_completion(bdi, &done); wait_sb_inodes(sb); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5ef05b5..8c5e2fa 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1445,9 +1445,9 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) list_del(&req->writepages_entry); for (i = 0; i < req->num_pages; i++) { - dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_wb_stat(&bdi->wb, WB_WRITEBACK); dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP); - bdi_writeout_inc(bdi); + wb_writeout_inc(&bdi->wb); } wake_up(&fi->page_waitq); } @@ -1634,7 +1634,7 @@ static int fuse_writepage_locked(struct page *page) req->end = fuse_writepage_end; req->inode = inode; - inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fc->lock); @@ -1749,9 +1749,9 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, copy_highpage(old_req->pages[0], page); spin_unlock(&fc->lock); - dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_wb_stat(&bdi->wb, WB_WRITEBACK); dec_zone_page_state(page, NR_WRITEBACK_TEMP); - bdi_writeout_inc(bdi); + wb_writeout_inc(&bdi->wb); fuse_writepage_free(fc, new_req); fuse_request_free(new_req); goto out; @@ -1848,7 +1848,7 @@ static int fuse_writepages_fill(struct page *page, req->page_descs[req->num_pages].offset = 0; req->page_descs[req->num_pages].length = PAGE_SIZE; - inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 859c6ed..2982445 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -748,7 +748,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL) gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH); - if (bdi->dirty_exceeded) + if (bdi->wb.dirty_exceeded) gfs2_ail1_flush(sdp, wbc); else filemap_fdatawrite(metamapping); diff --git a/fs/hfs/super.c b/fs/hfs/super.c index eee7206..55c03b9 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/nls.h> diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 593af2f..7302d96 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -11,6 +11,7 @@ #include <linux/init.h> #include <linux/pagemap.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/vfs.h> @@ -224,6 +224,7 @@ EXPORT_SYMBOL(free_inode_nonrcu); void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); + inode_detach_wb(inode); security_inode_free(inode); fsnotify_inode_delete(inode); locks_free_lock_context(inode->i_flctx); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 988b32e..4227dc4 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -390,7 +390,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) unsigned long blocknr; if (is_journal_aborted(journal)) - return 1; + return -EIO; if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr)) return 1; @@ -405,10 +405,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * jbd2_cleanup_journal_tail() doesn't get called all that often. */ if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL); - __jbd2_update_log_tail(journal, first_tid, blocknr); - return 0; + return __jbd2_update_log_tail(journal, first_tid, blocknr); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5c187de..4ff3fad 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -371,16 +371,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, */ J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); -retry_alloc: - new_bh = alloc_buffer_head(GFP_NOFS); - if (!new_bh) { - /* - * Failure is not an option, but __GFP_NOFAIL is going - * away; so we retry ourselves here. - */ - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto retry_alloc; - } + new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); /* keep subsequent assertions sane */ atomic_set(&new_bh->b_count, 1); @@ -885,9 +876,10 @@ int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, * * Requires j_checkpoint_mutex */ -void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) +int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) { unsigned long freed; + int ret; BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); @@ -897,7 +889,10 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) * space and if we lose sb update during power failure we'd replay * old transaction with possibly newly overwritten data. */ - jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); + ret = jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); + if (ret) + goto out; + write_lock(&journal->j_state_lock); freed = block - journal->j_tail; if (block < journal->j_tail) @@ -913,6 +908,9 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) journal->j_tail_sequence = tid; journal->j_tail = block; write_unlock(&journal->j_state_lock); + +out: + return ret; } /* @@ -1325,7 +1323,7 @@ static int journal_reset(journal_t *journal) return jbd2_journal_start_thread(journal); } -static void jbd2_write_superblock(journal_t *journal, int write_op) +static int jbd2_write_superblock(journal_t *journal, int write_op) { struct buffer_head *bh = journal->j_sb_buffer; journal_superblock_t *sb = journal->j_superblock; @@ -1364,7 +1362,10 @@ static void jbd2_write_superblock(journal_t *journal, int write_op) printk(KERN_ERR "JBD2: Error %d detected when updating " "journal superblock for %s.\n", ret, journal->j_devname); + jbd2_journal_abort(journal, ret); } + + return ret; } /** @@ -1377,10 +1378,11 @@ static void jbd2_write_superblock(journal_t *journal, int write_op) * Update a journal's superblock information about log tail and write it to * disk, waiting for the IO to complete. */ -void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, +int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, unsigned long tail_block, int write_op) { journal_superblock_t *sb = journal->j_superblock; + int ret; BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", @@ -1389,13 +1391,18 @@ void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, sb->s_sequence = cpu_to_be32(tail_tid); sb->s_start = cpu_to_be32(tail_block); - jbd2_write_superblock(journal, write_op); + ret = jbd2_write_superblock(journal, write_op); + if (ret) + goto out; /* Log is no longer empty */ write_lock(&journal->j_state_lock); WARN_ON(!sb->s_sequence); journal->j_flags &= ~JBD2_FLUSHED; write_unlock(&journal->j_state_lock); + +out: + return ret; } /** @@ -1944,7 +1951,14 @@ int jbd2_journal_flush(journal_t *journal) return -EIO; mutex_lock(&journal->j_checkpoint_mutex); - jbd2_cleanup_journal_tail(journal); + if (!err) { + err = jbd2_cleanup_journal_tail(journal); + if (err < 0) { + mutex_unlock(&journal->j_checkpoint_mutex); + goto out; + } + err = 0; + } /* Finally, mark the journal as really needing no recovery. * This sets s_start==0 in the underlying superblock, which is @@ -1960,7 +1974,8 @@ int jbd2_journal_flush(journal_t *journal) J_ASSERT(journal->j_head == journal->j_tail); J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); write_unlock(&journal->j_state_lock); - return 0; +out: + return err; } /** @@ -2324,7 +2339,7 @@ static int jbd2_journal_init_journal_head_cache(void) jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", sizeof(struct journal_head), 0, /* offset */ - SLAB_TEMPORARY, /* flags */ + SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU, NULL); /* ctor */ retval = 0; if (!jbd2_journal_head_cache) { @@ -2356,10 +2371,8 @@ static struct journal_head *journal_alloc_journal_head(void) if (!ret) { jbd_debug(1, "out of memory for journal_head\n"); pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); - while (!ret) { - yield(); - ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); - } + ret = kmem_cache_zalloc(jbd2_journal_head_cache, + GFP_NOFS | __GFP_NOFAIL); } return ret; } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 14214da..0abf2e7f7 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -141,11 +141,13 @@ static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, { struct list_head *hash_list; struct jbd2_revoke_record_s *record; + gfp_t gfp_mask = GFP_NOFS; -repeat: - record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS); + if (journal_oom_retry) + gfp_mask |= __GFP_NOFAIL; + record = kmem_cache_alloc(jbd2_revoke_record_cache, gfp_mask); if (!record) - goto oom; + return -ENOMEM; record->sequence = seq; record->blocknr = blocknr; @@ -154,13 +156,6 @@ repeat: list_add(&record->hash, hash_list); spin_unlock(&journal->j_revoke_lock); return 0; - -oom: - if (!journal_oom_retry) - return -ENOMEM; - jbd_debug(1, "ENOMEM in %s, retrying\n", __func__); - yield(); - goto repeat; } /* Find a revoke record in the journal's hash table. */ diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index ff2f2e6..cbe8b3a 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -278,22 +278,16 @@ static int start_this_handle(journal_t *journal, handle_t *handle, alloc_transaction: if (!journal->j_running_transaction) { + /* + * If __GFP_FS is not present, then we may be being called from + * inside the fs writeback layer, so we MUST NOT fail. + */ + if ((gfp_mask & __GFP_FS) == 0) + gfp_mask |= __GFP_NOFAIL; new_transaction = kmem_cache_zalloc(transaction_cache, gfp_mask); - if (!new_transaction) { - /* - * If __GFP_FS is not present, then we may be - * being called from inside the fs writeback - * layer, so we MUST NOT fail. Since - * __GFP_NOFAIL is going away, we will arrange - * to retry the allocation ourselves. - */ - if ((gfp_mask & __GFP_FS) == 0) { - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto alloc_transaction; - } + if (!new_transaction) return -ENOMEM; - } } jbd_debug(3, "New handle %p going live.\n", handle); @@ -761,6 +755,30 @@ static void warn_dirty_buffer(struct buffer_head *bh) bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } +/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */ +static void jbd2_freeze_jh_data(struct journal_head *jh) +{ + struct page *page; + int offset; + char *source; + struct buffer_head *bh = jh2bh(jh); + + J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n"); + page = bh->b_page; + offset = offset_in_page(bh->b_data); + source = kmap_atomic(page); + /* Fire data frozen trigger just before we copy the data */ + jbd2_buffer_frozen_trigger(jh, source + offset, jh->b_triggers); + memcpy(jh->b_frozen_data, source + offset, bh->b_size); + kunmap_atomic(source); + + /* + * Now that the frozen data is saved off, we need to store any matching + * triggers. + */ + jh->b_frozen_triggers = jh->b_triggers; +} + /* * If the buffer is already part of the current transaction, then there * is nothing we need to do. If it is already part of a prior @@ -780,7 +798,6 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, journal_t *journal; int error; char *frozen_buffer = NULL; - int need_copy = 0; unsigned long start_lock, time_lock; if (is_handle_aborted(handle)) @@ -867,119 +884,96 @@ repeat: jh->b_modified = 0; /* + * If the buffer is not journaled right now, we need to make sure it + * doesn't get written to disk before the caller actually commits the + * new data + */ + if (!jh->b_transaction) { + JBUFFER_TRACE(jh, "no transaction"); + J_ASSERT_JH(jh, !jh->b_next_transaction); + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + /* + * Make sure all stores to jh (b_modified, b_frozen_data) are + * visible before attaching it to the running transaction. + * Paired with barrier in jbd2_write_access_granted() + */ + smp_wmb(); + spin_lock(&journal->j_list_lock); + __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); + spin_unlock(&journal->j_list_lock); + goto done; + } + /* * If there is already a copy-out version of this buffer, then we don't * need to make another one */ if (jh->b_frozen_data) { JBUFFER_TRACE(jh, "has frozen data"); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - jh->b_next_transaction = transaction; - goto done; + goto attach_next; } - /* Is there data here we need to preserve? */ + JBUFFER_TRACE(jh, "owned by older transaction"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction); - if (jh->b_transaction && jh->b_transaction != transaction) { - JBUFFER_TRACE(jh, "owned by older transaction"); - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); + /* + * There is one case we have to be very careful about. If the + * committing transaction is currently writing this buffer out to disk + * and has NOT made a copy-out, then we cannot modify the buffer + * contents at all right now. The essence of copy-out is that it is + * the extra copy, not the primary copy, which gets journaled. If the + * primary copy is already going to disk then we cannot do copy-out + * here. + */ + if (buffer_shadow(bh)) { + JBUFFER_TRACE(jh, "on shadow: sleep"); + jbd_unlock_bh_state(bh); + wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE); + goto repeat; + } - /* There is one case we have to be very careful about. - * If the committing transaction is currently writing - * this buffer out to disk and has NOT made a copy-out, - * then we cannot modify the buffer contents at all - * right now. The essence of copy-out is that it is the - * extra copy, not the primary copy, which gets - * journaled. If the primary copy is already going to - * disk then we cannot do copy-out here. */ - - if (buffer_shadow(bh)) { - JBUFFER_TRACE(jh, "on shadow: sleep"); + /* + * Only do the copy if the currently-owning transaction still needs it. + * If buffer isn't on BJ_Metadata list, the committing transaction is + * past that stage (here we use the fact that BH_Shadow is set under + * bh_state lock together with refiling to BJ_Shadow list and at this + * point we know the buffer doesn't have BH_Shadow set). + * + * Subtle point, though: if this is a get_undo_access, then we will be + * relying on the frozen_data to contain the new value of the + * committed_data record after the transaction, so we HAVE to force the + * frozen_data copy in that case. + */ + if (jh->b_jlist == BJ_Metadata || force_copy) { + JBUFFER_TRACE(jh, "generate frozen data"); + if (!frozen_buffer) { + JBUFFER_TRACE(jh, "allocate memory for buffer"); jbd_unlock_bh_state(bh); - wait_on_bit_io(&bh->b_state, BH_Shadow, - TASK_UNINTERRUPTIBLE); - goto repeat; - } - - /* - * Only do the copy if the currently-owning transaction still - * needs it. If buffer isn't on BJ_Metadata list, the - * committing transaction is past that stage (here we use the - * fact that BH_Shadow is set under bh_state lock together with - * refiling to BJ_Shadow list and at this point we know the - * buffer doesn't have BH_Shadow set). - * - * Subtle point, though: if this is a get_undo_access, - * then we will be relying on the frozen_data to contain - * the new value of the committed_data record after the - * transaction, so we HAVE to force the frozen_data copy - * in that case. - */ - if (jh->b_jlist == BJ_Metadata || force_copy) { - JBUFFER_TRACE(jh, "generate frozen data"); + frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!frozen_buffer) { - JBUFFER_TRACE(jh, "allocate memory for buffer"); - jbd_unlock_bh_state(bh); - frozen_buffer = - jbd2_alloc(jh2bh(jh)->b_size, - GFP_NOFS); - if (!frozen_buffer) { - printk(KERN_ERR - "%s: OOM for frozen_buffer\n", - __func__); - JBUFFER_TRACE(jh, "oom!"); - error = -ENOMEM; - jbd_lock_bh_state(bh); - goto done; - } - goto repeat; + printk(KERN_ERR "%s: OOM for frozen_buffer\n", + __func__); + JBUFFER_TRACE(jh, "oom!"); + error = -ENOMEM; + goto out; } - jh->b_frozen_data = frozen_buffer; - frozen_buffer = NULL; - need_copy = 1; + goto repeat; } - jh->b_next_transaction = transaction; + jh->b_frozen_data = frozen_buffer; + frozen_buffer = NULL; + jbd2_freeze_jh_data(jh); } - - +attach_next: /* - * Finally, if the buffer is not journaled right now, we need to make - * sure it doesn't get written to disk before the caller actually - * commits the new data + * Make sure all stores to jh (b_modified, b_frozen_data) are visible + * before attaching it to the running transaction. Paired with barrier + * in jbd2_write_access_granted() */ - if (!jh->b_transaction) { - JBUFFER_TRACE(jh, "no transaction"); - J_ASSERT_JH(jh, !jh->b_next_transaction); - JBUFFER_TRACE(jh, "file as BJ_Reserved"); - spin_lock(&journal->j_list_lock); - __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); - spin_unlock(&journal->j_list_lock); - } + smp_wmb(); + jh->b_next_transaction = transaction; done: - if (need_copy) { - struct page *page; - int offset; - char *source; - - J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), - "Possible IO failure.\n"); - page = jh2bh(jh)->b_page; - offset = offset_in_page(jh2bh(jh)->b_data); - source = kmap_atomic(page); - /* Fire data frozen trigger just before we copy the data */ - jbd2_buffer_frozen_trigger(jh, source + offset, - jh->b_triggers); - memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); - kunmap_atomic(source); - - /* - * Now that the frozen data is saved off, we need to store - * any matching triggers. - */ - jh->b_frozen_triggers = jh->b_triggers; - } jbd_unlock_bh_state(bh); /* @@ -996,6 +990,55 @@ out: return error; } +/* Fast check whether buffer is already attached to the required transaction */ +static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh) +{ + struct journal_head *jh; + bool ret = false; + + /* Dirty buffers require special handling... */ + if (buffer_dirty(bh)) + return false; + + /* + * RCU protects us from dereferencing freed pages. So the checks we do + * are guaranteed not to oops. However the jh slab object can get freed + * & reallocated while we work with it. So we have to be careful. When + * we see jh attached to the running transaction, we know it must stay + * so until the transaction is committed. Thus jh won't be freed and + * will be attached to the same bh while we run. However it can + * happen jh gets freed, reallocated, and attached to the transaction + * just after we get pointer to it from bh. So we have to be careful + * and recheck jh still belongs to our bh before we return success. + */ + rcu_read_lock(); + if (!buffer_jbd(bh)) + goto out; + /* This should be bh2jh() but that doesn't work with inline functions */ + jh = READ_ONCE(bh->b_private); + if (!jh) + goto out; + if (jh->b_transaction != handle->h_transaction && + jh->b_next_transaction != handle->h_transaction) + goto out; + /* + * There are two reasons for the barrier here: + * 1) Make sure to fetch b_bh after we did previous checks so that we + * detect when jh went through free, realloc, attach to transaction + * while we were checking. Paired with implicit barrier in that path. + * 2) So that access to bh done after jbd2_write_access_granted() + * doesn't get reordered and see inconsistent state of concurrent + * do_get_write_access(). + */ + smp_mb(); + if (unlikely(jh->b_bh != bh)) + goto out; + ret = true; +out: + rcu_read_unlock(); + return ret; +} + /** * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. * @handle: transaction to add buffer modifications to @@ -1009,9 +1052,13 @@ out: int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) { - struct journal_head *jh = jbd2_journal_add_journal_head(bh); + struct journal_head *jh; int rc; + if (jbd2_write_access_granted(handle, bh)) + return 0; + + jh = jbd2_journal_add_journal_head(bh); /* We do not want to get caught playing with fields which the * log thread also manipulates. Make sure that the buffer * completes any outstanding IO before proceeding. */ @@ -1141,11 +1188,14 @@ out: int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) { int err; - struct journal_head *jh = jbd2_journal_add_journal_head(bh); + struct journal_head *jh; char *committed_data = NULL; JBUFFER_TRACE(jh, "entry"); + if (jbd2_write_access_granted(handle, bh)) + return 0; + jh = jbd2_journal_add_journal_head(bh); /* * Do this first --- it can drop the journal lock, so we want to * make sure that obtaining the committed_data is done @@ -1230,8 +1280,6 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh, triggers->t_abort(triggers, jh2bh(jh)); } - - /** * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. @@ -1264,12 +1312,36 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) if (is_handle_aborted(handle)) return -EROFS; - journal = transaction->t_journal; - jh = jbd2_journal_grab_journal_head(bh); - if (!jh) { + if (!buffer_jbd(bh)) { ret = -EUCLEAN; goto out; } + /* + * We don't grab jh reference here since the buffer must be part + * of the running transaction. + */ + jh = bh2jh(bh); + J_ASSERT_JH(jh, jh->b_transaction == transaction || + jh->b_next_transaction == transaction); + if (jh->b_modified == 1) { + /* + * If it's in our transaction it must be in BJ_Metadata list. + * The assertion is unreliable since we may see jh in + * inconsistent state unless we grab bh_state lock. But this + * is crutial to catch bugs so let's do a reliable check until + * the lockless handling is fully proven. + */ + if (jh->b_transaction == transaction && + jh->b_jlist != BJ_Metadata) { + jbd_lock_bh_state(bh); + J_ASSERT_JH(jh, jh->b_transaction != transaction || + jh->b_jlist == BJ_Metadata); + jbd_unlock_bh_state(bh); + } + goto out; + } + + journal = transaction->t_journal; jbd_debug(5, "journal_head %p\n", jh); JBUFFER_TRACE(jh, "entry"); @@ -1360,7 +1432,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) spin_unlock(&journal->j_list_lock); out_unlock_bh: jbd_unlock_bh_state(bh); - jbd2_journal_put_journal_head(jh); out: JBUFFER_TRACE(jh, "exit"); return ret; @@ -605,6 +605,8 @@ alloc_new: bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); if (bio == NULL) goto confused; + + wbc_init_bio(wbc, bio); } /* @@ -612,6 +614,7 @@ alloc_new: * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. it will not write anything) */ + wbc_account_io(wbc, page, PAGE_SIZE); length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { bio = mpage_bio_submit(WRITE, bio); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index a46bf6d..b34f2e2 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -32,6 +32,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_page.h> #include <linux/module.h> +#include <linux/backing-dev.h> #include <linux/sunrpc/metrics.h> diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9e6475b..7e3c460 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -607,7 +607,7 @@ void nfs_mark_page_unstable(struct page *page) struct inode *inode = page_file_mapping(page)->host; inc_zone_page_state(page, NR_UNSTABLE_NFS); - inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index dfc19f1..e6c2625 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -853,7 +853,8 @@ static void nfs_clear_page_commit(struct page *page) { dec_zone_page_state(page, NR_UNSTABLE_NFS); - dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE); + dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb, + WB_RECLAIMABLE); } /* Called holding inode (/cinfo) lock */ diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index dc3a9efd..42468e5 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -343,11 +343,6 @@ static void nilfs_end_bio_write(struct bio *bio, int err) const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct nilfs_segment_buffer *segbuf = bio->bi_private; - if (err == -EOPNOTSUPP) { - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - /* to be detected by nilfs_segbuf_submit_bio() */ - } - if (!uptodate) atomic_inc(&segbuf->sb_err); @@ -374,15 +369,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, bio->bi_end_io = nilfs_end_bio_write; bio->bi_private = segbuf; - bio_get(bio); submit_bio(mode, bio); segbuf->sb_nbio++; - if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - bio_put(bio); - err = -EOPNOTSUPP; - goto failed; - } - bio_put(bio); wi->bio = NULL; wi->rest_blocks -= wi->end - wi->start; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index fbfadb2..719f7f4 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -37,6 +37,7 @@ #include <linux/falloc.h> #include <linux/quotaops.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <cluster/masklog.h> diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index d766bfa..0e4cf728 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -21,6 +21,7 @@ #include "xattr.h" #include <linux/init.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/quotaops.h> diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 20f5dbd..9547a278 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2246,7 +2246,9 @@ static int __init ubifs_init(void) if (!ubifs_inode_slab) return -ENOMEM; - register_shrinker(&ubifs_shrinker_info); + err = register_shrinker(&ubifs_shrinker_info); + if (err) + goto out_slab; err = ubifs_compressors_init(); if (err) @@ -2270,6 +2272,7 @@ out_compr: ubifs_compressors_exit(); out_shrinker: unregister_shrinker(&ubifs_shrinker_info); +out_slab: kmem_cache_destroy(ubifs_inode_slab); return err; } diff --git a/fs/ufs/super.c b/fs/ufs/super.c index b3bc3e7..098508a 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -80,6 +80,7 @@ #include <linux/stat.h> #include <linux/string.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/init.h> #include <linux/parser.h> #include <linux/buffer_head.h> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a56960d..e5099f2 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -356,7 +356,6 @@ xfs_end_bio( { xfs_ioend_t *ioend = bio->bi_private; - ASSERT(atomic_read(&bio->bi_cnt) >= 1); ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error; /* Toss bio and pass work off to an xfsdatad thread */ @@ -1874,6 +1873,7 @@ xfs_vm_set_page_dirty( loff_t end_offset; loff_t offset; int newly_dirty; + struct mem_cgroup *memcg; if (unlikely(!mapping)) return !TestSetPageDirty(page); @@ -1893,6 +1893,11 @@ xfs_vm_set_page_dirty( offset += 1 << inode->i_blkbits; } while (bh != head); } + /* + * Use mem_group_begin_page_stat() to keep PageDirty synchronized with + * per-memcg dirty page counters. + */ + memcg = mem_cgroup_begin_page_stat(page); newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); @@ -1903,13 +1908,15 @@ xfs_vm_set_page_dirty( spin_lock_irqsave(&mapping->tree_lock, flags); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(!PageUptodate(page)); - account_page_dirtied(page, mapping); + account_page_dirtied(page, mapping, memcg); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } spin_unlock_irqrestore(&mapping->tree_lock, flags); - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } + mem_cgroup_end_page_stat(memcg); + if (newly_dirty) + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); return newly_dirty; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 3b75912..7c62fca 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -41,6 +41,7 @@ #include <linux/dcache.h> #include <linux/falloc.h> #include <linux/pagevec.h> +#include <linux/backing-dev.h> static const struct vm_operations_struct xfs_file_vm_ops; |