summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/check-integrity.c162
-rw-r--r--fs/btrfs/compression.c18
-rw-r--r--fs/btrfs/ctree.c2
-rw-r--r--fs/btrfs/ctree.h11
-rw-r--r--fs/btrfs/dev-replace.c12
-rw-r--r--fs/btrfs/dir-item.c10
-rw-r--r--fs/btrfs/disk-io.c28
-rw-r--r--fs/btrfs/extent-tree.c36
-rw-r--r--fs/btrfs/extent_io.c41
-rw-r--r--fs/btrfs/extent_io.h1
-rw-r--r--fs/btrfs/extent_map.c2
-rw-r--r--fs/btrfs/file.c41
-rw-r--r--fs/btrfs/inode.c105
-rw-r--r--fs/btrfs/ordered-data.c49
-rw-r--r--fs/btrfs/ordered-data.h12
-rw-r--r--fs/btrfs/scrub.c90
-rw-r--r--fs/btrfs/super.c74
-rw-r--r--fs/btrfs/transaction.c128
-rw-r--r--fs/btrfs/transaction.h4
-rw-r--r--fs/btrfs/tree-log.c50
-rw-r--r--fs/btrfs/xattr.c150
-rw-r--r--fs/ceph/caps.c2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c2
-rw-r--r--fs/nfs/blocklayout/rpc_pipefs.c14
-rw-r--r--fs/nfs/delegation.c25
-rw-r--r--fs/nfs/delegation.h1
-rw-r--r--fs/nfs/dir.c1
-rw-r--r--fs/nfs/direct.c1
-rw-r--r--fs/nfs/filelayout/filelayout.c3
-rw-r--r--fs/nfs/inode.c2
-rw-r--r--fs/nfs/netns.h1
-rw-r--r--fs/nfs/nfs4proc.c95
-rw-r--r--fs/nfs/write.c2
-rw-r--r--fs/notify/fsnotify.c36
-rw-r--r--fs/notify/fsnotify.h4
-rw-r--r--fs/notify/inode_mark.c8
-rw-r--r--fs/notify/mark.c36
-rw-r--r--fs/notify/vfsmount_mark.c8
38 files changed, 817 insertions, 450 deletions
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index cb7f3fe..47cb267 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -326,9 +326,6 @@ static int btrfsic_handle_extent_data(struct btrfsic_state *state,
static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
struct btrfsic_block_data_ctx *block_ctx_out,
int mirror_num);
-static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
- u32 len, struct block_device *bdev,
- struct btrfsic_block_data_ctx *block_ctx_out);
static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
static int btrfsic_read_block(struct btrfsic_state *state,
struct btrfsic_block_data_ctx *block_ctx);
@@ -1326,24 +1323,25 @@ static int btrfsic_create_link_to_next_block(
l = NULL;
next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
} else {
- if (next_block->logical_bytenr != next_bytenr &&
- !(!next_block->is_metadata &&
- 0 == next_block->logical_bytenr)) {
- printk(KERN_INFO
- "Referenced block @%llu (%s/%llu/%d)"
- " found in hash table, %c,"
- " bytenr mismatch (!= stored %llu).\n",
- next_bytenr, next_block_ctx->dev->name,
- next_block_ctx->dev_bytenr, *mirror_nump,
- btrfsic_get_block_type(state, next_block),
- next_block->logical_bytenr);
- } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- printk(KERN_INFO
- "Referenced block @%llu (%s/%llu/%d)"
- " found in hash table, %c.\n",
- next_bytenr, next_block_ctx->dev->name,
- next_block_ctx->dev_bytenr, *mirror_nump,
- btrfsic_get_block_type(state, next_block));
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+ if (next_block->logical_bytenr != next_bytenr &&
+ !(!next_block->is_metadata &&
+ 0 == next_block->logical_bytenr))
+ printk(KERN_INFO
+ "Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
+ next_bytenr, next_block_ctx->dev->name,
+ next_block_ctx->dev_bytenr, *mirror_nump,
+ btrfsic_get_block_type(state,
+ next_block),
+ next_block->logical_bytenr);
+ else
+ printk(KERN_INFO
+ "Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n",
+ next_bytenr, next_block_ctx->dev->name,
+ next_block_ctx->dev_bytenr, *mirror_nump,
+ btrfsic_get_block_type(state,
+ next_block));
+ }
next_block->logical_bytenr = next_bytenr;
next_block->mirror_num = *mirror_nump;
@@ -1529,7 +1527,9 @@ static int btrfsic_handle_extent_data(
return -1;
}
if (!block_was_created) {
- if (next_block->logical_bytenr != next_bytenr &&
+ if ((state->print_mask &
+ BTRFSIC_PRINT_MASK_VERBOSE) &&
+ next_block->logical_bytenr != next_bytenr &&
!(!next_block->is_metadata &&
0 == next_block->logical_bytenr)) {
printk(KERN_INFO
@@ -1607,25 +1607,6 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
return ret;
}
-static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
- u32 len, struct block_device *bdev,
- struct btrfsic_block_data_ctx *block_ctx_out)
-{
- block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
- block_ctx_out->dev_bytenr = bytenr;
- block_ctx_out->start = bytenr;
- block_ctx_out->len = len;
- block_ctx_out->datav = NULL;
- block_ctx_out->pagev = NULL;
- block_ctx_out->mem_to_free = NULL;
- if (NULL != block_ctx_out->dev) {
- return 0;
- } else {
- printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
- return -ENXIO;
- }
-}
-
static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
{
if (block_ctx->mem_to_free) {
@@ -1901,25 +1882,26 @@ again:
dev_state,
dev_bytenr);
}
- if (block->logical_bytenr != bytenr &&
- !(!block->is_metadata &&
- block->logical_bytenr == 0))
- printk(KERN_INFO
- "Written block @%llu (%s/%llu/%d)"
- " found in hash table, %c,"
- " bytenr mismatch"
- " (!= stored %llu).\n",
- bytenr, dev_state->name, dev_bytenr,
- block->mirror_num,
- btrfsic_get_block_type(state, block),
- block->logical_bytenr);
- else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
- printk(KERN_INFO
- "Written block @%llu (%s/%llu/%d)"
- " found in hash table, %c.\n",
- bytenr, dev_state->name, dev_bytenr,
- block->mirror_num,
- btrfsic_get_block_type(state, block));
+ if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+ if (block->logical_bytenr != bytenr &&
+ !(!block->is_metadata &&
+ block->logical_bytenr == 0))
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
+ bytenr, dev_state->name,
+ dev_bytenr,
+ block->mirror_num,
+ btrfsic_get_block_type(state,
+ block),
+ block->logical_bytenr);
+ else
+ printk(KERN_INFO
+ "Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
+ bytenr, dev_state->name,
+ dev_bytenr, block->mirror_num,
+ btrfsic_get_block_type(state,
+ block));
+ }
block->logical_bytenr = bytenr;
} else {
if (num_pages * PAGE_CACHE_SIZE <
@@ -2002,24 +1984,13 @@ again:
}
}
- if (block->is_superblock)
- ret = btrfsic_map_superblock(state, bytenr,
- processed_len,
- bdev, &block_ctx);
- else
- ret = btrfsic_map_block(state, bytenr, processed_len,
- &block_ctx, 0);
- if (ret) {
- printk(KERN_INFO
- "btrfsic: btrfsic_map_block(root @%llu)"
- " failed!\n", bytenr);
- goto continue_loop;
- }
- block_ctx.datav = mapped_datav;
- /* the following is required in case of writes to mirrors,
- * use the same that was used for the lookup */
block_ctx.dev = dev_state;
block_ctx.dev_bytenr = dev_bytenr;
+ block_ctx.start = bytenr;
+ block_ctx.len = processed_len;
+ block_ctx.pagev = NULL;
+ block_ctx.mem_to_free = NULL;
+ block_ctx.datav = mapped_datav;
if (is_metadata || state->include_extent_data) {
block->never_written = 0;
@@ -2133,10 +2104,6 @@ again:
/* this is getting ugly for the
* include_extent_data case... */
bytenr = 0; /* unknown */
- block_ctx.start = bytenr;
- block_ctx.len = processed_len;
- block_ctx.mem_to_free = NULL;
- block_ctx.pagev = NULL;
} else {
processed_len = state->metablock_size;
bytenr = btrfs_stack_header_bytenr(
@@ -2149,22 +2116,15 @@ again:
"Written block @%llu (%s/%llu/?)"
" !found in hash table, M.\n",
bytenr, dev_state->name, dev_bytenr);
-
- ret = btrfsic_map_block(state, bytenr, processed_len,
- &block_ctx, 0);
- if (ret) {
- printk(KERN_INFO
- "btrfsic: btrfsic_map_block(root @%llu)"
- " failed!\n",
- dev_bytenr);
- goto continue_loop;
- }
}
- block_ctx.datav = mapped_datav;
- /* the following is required in case of writes to mirrors,
- * use the same that was used for the lookup */
+
block_ctx.dev = dev_state;
block_ctx.dev_bytenr = dev_bytenr;
+ block_ctx.start = bytenr;
+ block_ctx.len = processed_len;
+ block_ctx.pagev = NULL;
+ block_ctx.mem_to_free = NULL;
+ block_ctx.datav = mapped_datav;
block = btrfsic_block_alloc();
if (NULL == block) {
@@ -3130,10 +3090,13 @@ int btrfsic_mount(struct btrfs_root *root,
root->sectorsize, PAGE_CACHE_SIZE);
return -1;
}
- state = kzalloc(sizeof(*state), GFP_NOFS);
- if (NULL == state) {
- printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
- return -1;
+ state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ if (!state) {
+ state = vzalloc(sizeof(*state));
+ if (!state) {
+ printk(KERN_INFO "btrfs check-integrity: vzalloc() failed!\n");
+ return -1;
+ }
}
if (!btrfsic_is_initialized) {
@@ -3277,5 +3240,8 @@ void btrfsic_unmount(struct btrfs_root *root,
mutex_unlock(&btrfsic_mutex);
- kfree(state);
+ if (is_vmalloc_addr(state))
+ vfree(state);
+ else
+ kfree(state);
}
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index d3220d3..1bf411b 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -224,16 +224,19 @@ out:
* Clear the writeback bits on all of the file
* pages for a compressed write
*/
-static noinline void end_compressed_writeback(struct inode *inode, u64 start,
- unsigned long ram_size)
+static noinline void end_compressed_writeback(struct inode *inode,
+ const struct compressed_bio *cb)
{
- unsigned long index = start >> PAGE_CACHE_SHIFT;
- unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+ unsigned long index = cb->start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_CACHE_SHIFT;
struct page *pages[16];
unsigned long nr_pages = end_index - index + 1;
int i;
int ret;
+ if (cb->errors)
+ mapping_set_error(inode->i_mapping, -EIO);
+
while (nr_pages > 0) {
ret = find_get_pages_contig(inode->i_mapping, index,
min_t(unsigned long,
@@ -244,6 +247,8 @@ static noinline void end_compressed_writeback(struct inode *inode, u64 start,
continue;
}
for (i = 0; i < ret; i++) {
+ if (cb->errors)
+ SetPageError(pages[i]);
end_page_writeback(pages[i]);
page_cache_release(pages[i]);
}
@@ -287,10 +292,11 @@ static void end_compressed_bio_write(struct bio *bio, int err)
tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
cb->start,
cb->start + cb->len - 1,
- NULL, 1);
+ NULL,
+ err ? 0 : 1);
cb->compressed_pages[0]->mapping = NULL;
- end_compressed_writeback(inode, cb->start, cb->len);
+ end_compressed_writeback(inode, cb);
/* note, our inode could be gone now */
/*
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 19bc616..8172341 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2939,7 +2939,7 @@ done:
*/
if (!p->leave_spinning)
btrfs_set_path_blocking(p);
- if (ret < 0)
+ if (ret < 0 && !p->skip_release_on_error)
btrfs_release_path(p);
return ret;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 817fc19..9918ba3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -607,6 +607,7 @@ struct btrfs_path {
unsigned int leave_spinning:1;
unsigned int search_commit_root:1;
unsigned int need_commit_sem:1;
+ unsigned int skip_release_on_error:1;
};
/*
@@ -1170,6 +1171,7 @@ struct btrfs_space_info {
struct percpu_counter total_bytes_pinned;
struct list_head list;
+ struct list_head ro_bgs;
struct rw_semaphore groups_sem;
/* for block groups in our same type */
@@ -1305,6 +1307,9 @@ struct btrfs_block_group_cache {
/* For delayed block group creation or deletion of empty block groups */
struct list_head bg_list;
+
+ /* For read-only block groups */
+ struct list_head ro_list;
};
/* delayed seq elem */
@@ -3734,6 +3739,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
int verify_dir_item(struct btrfs_root *root,
struct extent_buffer *leaf,
struct btrfs_dir_item *dir_item);
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name,
+ int name_len);
/* orphan.c */
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -3905,6 +3914,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
+int btrfs_inode_check_errors(struct inode *inode);
extern const struct dentry_operations btrfs_dentry_operations;
/* ioctl.c */
@@ -3949,6 +3959,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
struct page **pages, size_t num_pages,
loff_t pos, size_t write_bytes,
struct extent_state **cached);
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 6f662b3..971c061 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -422,9 +422,15 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(root->fs_info, ret);
- WARN_ON(ret);
+ /* don't warn if EINPROGRESS, someone else might be running scrub */
+ if (ret == -EINPROGRESS) {
+ args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
+ ret = 0;
+ } else {
+ WARN_ON(ret);
+ }
- return 0;
+ return ret;
leave:
dev_replace->srcdev = NULL;
@@ -542,7 +548,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
- return 0;
+ return scrub_ret;
}
printk_in_rcu(KERN_INFO
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index fc8df86..1752625 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -21,10 +21,6 @@
#include "hash.h"
#include "transaction.h"
-static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len);
-
/*
* insert a name into a directory, doing overflow properly if there is a hash
* collision. data_size indicates how big the item inserted should be. On
@@ -383,9 +379,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
* this walks through all the entries in a dir item and finds one
* for a specific name.
*/
-static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len)
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const char *name, int name_len)
{
struct btrfs_dir_item *dir_item;
unsigned long name_ptr;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6b406e3..1e3e414 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3841,12 +3841,12 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
*/
if (!IS_ALIGNED(btrfs_super_root(sb), 4096))
printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
- sb->root);
+ btrfs_super_root(sb));
if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096))
- printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
- sb->chunk_root);
+ printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n",
+ btrfs_super_chunk_root(sb));
if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096))
- printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+ printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
btrfs_super_log_root(sb));
if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
@@ -4131,6 +4131,25 @@ again:
return 0;
}
+static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ spin_lock(&fs_info->trans_lock);
+ while (!list_empty(&cur_trans->pending_ordered)) {
+ ordered = list_first_entry(&cur_trans->pending_ordered,
+ struct btrfs_ordered_extent,
+ trans_list);
+ list_del_init(&ordered->trans_list);
+ spin_unlock(&fs_info->trans_lock);
+
+ btrfs_put_ordered_extent(ordered);
+ spin_lock(&fs_info->trans_lock);
+ }
+ spin_unlock(&fs_info->trans_lock);
+}
+
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
struct btrfs_root *root)
{
@@ -4142,6 +4161,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
cur_trans->state = TRANS_STATE_UNBLOCKED;
wake_up(&root->fs_info->transaction_wait);
+ btrfs_free_pending_ordered(cur_trans, root->fs_info);
btrfs_destroy_delayed_inodes(root);
btrfs_assert_delayed_root_empty(root);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 47c1ba1..cf3460e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3504,6 +3504,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->chunk_alloc = 0;
found->flush = 0;
init_waitqueue_head(&found->wait);
+ INIT_LIST_HEAD(&found->ro_bgs);
ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
info->space_info_kobj, "%s",
@@ -8511,6 +8512,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
min_allocable_bytes <= sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
cache->ro = 1;
+ list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
ret = 0;
}
out:
@@ -8565,15 +8567,20 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
/*
* helper to account the unused space of all the readonly block group in the
- * list. takes mirrors into account.
+ * space_info. takes mirrors into account.
*/
-static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
+u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
struct btrfs_block_group_cache *block_group;
u64 free_bytes = 0;
int factor;
- list_for_each_entry(block_group, groups_list, list) {
+ /* It's df, we don't care if it's racey */
+ if (list_empty(&sinfo->ro_bgs))
+ return 0;
+
+ spin_lock(&sinfo->lock);
+ list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
spin_lock(&block_group->lock);
if (!block_group->ro) {
@@ -8594,26 +8601,6 @@ static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
spin_unlock(&block_group->lock);
}
-
- return free_bytes;
-}
-
-/*
- * helper to account the unused space of all the readonly block group in the
- * space_info. takes mirrors into account.
- */
-u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
-{
- int i;
- u64 free_bytes = 0;
-
- spin_lock(&sinfo->lock);
-
- for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
- if (!list_empty(&sinfo->block_groups[i]))
- free_bytes += __btrfs_get_ro_block_group_free_space(
- &sinfo->block_groups[i]);
-
spin_unlock(&sinfo->lock);
return free_bytes;
@@ -8633,6 +8620,7 @@ void btrfs_set_block_group_rw(struct btrfs_root *root,
cache->bytes_super - btrfs_block_group_used(&cache->item);
sinfo->bytes_readonly -= num_bytes;
cache->ro = 0;
+ list_del_init(&cache->ro_list);
spin_unlock(&cache->lock);
spin_unlock(&sinfo->lock);
}
@@ -9002,6 +8990,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->bg_list);
+ INIT_LIST_HEAD(&cache->ro_list);
btrfs_init_free_space_ctl(cache);
return cache;
@@ -9411,6 +9400,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* are still on the list after taking the semaphore
*/
list_del_init(&block_group->list);
+ list_del_init(&block_group->ro_list);
if (list_empty(&block_group->space_info->block_groups[index])) {
kobj = block_group->space_info->block_group_kobjs[index];
block_group->space_info->block_group_kobjs[index] = NULL;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index bf3f424..4ebabd2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -595,9 +595,14 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
clear = 1;
again:
if (!prealloc && (mask & __GFP_WAIT)) {
+ /*
+ * Don't care for allocation failure here because we might end
+ * up not needing the pre-allocated extent state at all, which
+ * is the case if we only have in the tree extent states that
+ * cover our input range and don't cover too any other range.
+ * If we end up needing a new extent state we allocate it later.
+ */
prealloc = alloc_extent_state(mask);
- if (!prealloc)
- return -ENOMEM;
}
spin_lock(&tree->lock);
@@ -796,17 +801,25 @@ static void set_state_bits(struct extent_io_tree *tree,
state->state |= bits_to_set;
}
-static void cache_state(struct extent_state *state,
- struct extent_state **cached_ptr)
+static void cache_state_if_flags(struct extent_state *state,
+ struct extent_state **cached_ptr,
+ const u64 flags)
{
if (cached_ptr && !(*cached_ptr)) {
- if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
+ if (!flags || (state->state & flags)) {
*cached_ptr = state;
atomic_inc(&state->refs);
}
}
}
+static void cache_state(struct extent_state *state,
+ struct extent_state **cached_ptr)
+{
+ return cache_state_if_flags(state, cached_ptr,
+ EXTENT_IOBITS | EXTENT_BOUNDARY);
+}
+
/*
* set some bits on a range in the tree. This may require allocations or
* sleeping, so the gfp mask is used to indicate what is allowed.
@@ -1058,13 +1071,21 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int err = 0;
u64 last_start;
u64 last_end;
+ bool first_iteration = true;
btrfs_debug_check_extent_io_range(tree, start, end);
again:
if (!prealloc && (mask & __GFP_WAIT)) {
+ /*
+ * Best effort, don't worry if extent state allocation fails
+ * here for the first iteration. We might have a cached state
+ * that matches exactly the target range, in which case no
+ * extent state allocations are needed. We'll only know this
+ * after locking the tree.
+ */
prealloc = alloc_extent_state(mask);
- if (!prealloc)
+ if (!prealloc && !first_iteration)
return -ENOMEM;
}
@@ -1234,6 +1255,7 @@ search_again:
spin_unlock(&tree->lock);
if (mask & __GFP_WAIT)
cond_resched();
+ first_iteration = false;
goto again;
}
@@ -1482,7 +1504,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
state = find_first_extent_bit_state(tree, start, bits);
got_it:
if (state) {
- cache_state(state, cached_state);
+ cache_state_if_flags(state, cached_state, 0);
*start_ret = state->start;
*end_ret = state->end;
ret = 0;
@@ -1746,6 +1768,9 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
if (page_ops == 0)
return 0;
+ if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
+ mapping_set_error(inode->i_mapping, -EIO);
+
while (nr_pages > 0) {
ret = find_get_pages_contig(inode->i_mapping, index,
min_t(unsigned long,
@@ -1763,6 +1788,8 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
clear_page_dirty_for_io(pages[i]);
if (page_ops & PAGE_SET_WRITEBACK)
set_page_writeback(pages[i]);
+ if (page_ops & PAGE_SET_ERROR)
+ SetPageError(pages[i]);
if (page_ops & PAGE_END_WRITEBACK)
end_page_writeback(pages[i]);
if (page_ops & PAGE_UNLOCK)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 6d4b938..ece9ce8 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -49,6 +49,7 @@
#define PAGE_SET_WRITEBACK (1 << 2)
#define PAGE_END_WRITEBACK (1 << 3)
#define PAGE_SET_PRIVATE2 (1 << 4)
+#define PAGE_SET_ERROR (1 << 5)
/*
* page->private values. Every page that is controlled by the extent
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 225302b..6a98bdd 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -287,8 +287,6 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
if (!em)
goto out;
- if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
- list_move(&em->list, &tree->modified_extents);
em->generation = gen;
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
em->mod_start = em->start;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a18ceab..0fbf0e7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1676,6 +1676,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
loff_t pos)
{
struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
ssize_t written;
ssize_t written_buffered;
loff_t endbyte;
@@ -1692,8 +1693,15 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
err = written_buffered;
goto out;
}
+ /*
+ * Ensure all data is persisted. We want the next direct IO read to be
+ * able to read what was just written.
+ */
endbyte = pos + written_buffered - 1;
- err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
+ err = btrfs_fdatawrite_range(inode, pos, endbyte);
+ if (err)
+ goto out;
+ err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
if (err)
goto out;
written += written_buffered;
@@ -1854,10 +1862,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
int ret;
atomic_inc(&BTRFS_I(inode)->sync_writers);
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
- if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ ret = btrfs_fdatawrite_range(inode, start, end);
atomic_dec(&BTRFS_I(inode)->sync_writers);
return ret;
@@ -2810,3 +2815,29 @@ int btrfs_auto_defrag_init(void)
return 0;
}
+
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
+{
+ int ret;
+
+ /*
+ * So with compression we will find and lock a dirty page and clear the
+ * first one as dirty, setup an async extent, and immediately return
+ * with the entire range locked but with nobody actually marked with
+ * writeback. So we can't just filemap_write_and_wait_range() and
+ * expect it to work since it will just kick off a thread to do the
+ * actual work. So we need to call filemap_fdatawrite_range _again_
+ * since it will wait on the page lock, which won't be unlocked until
+ * after the pages have been marked as writeback and so we're good to go
+ * from there. We have to do this otherwise we'll miss the ordered
+ * extents and that results in badness. Please Josef, do not think you
+ * know better and pull this out at some point in the future, it is
+ * right and you are wrong.
+ */
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+ if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+
+ return ret;
+}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d23362f..a5374c2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -382,7 +382,7 @@ static inline int inode_need_compress(struct inode *inode)
* are written in the same order that the flusher thread sent them
* down.
*/
-static noinline int compress_file_range(struct inode *inode,
+static noinline void compress_file_range(struct inode *inode,
struct page *locked_page,
u64 start, u64 end,
struct async_cow *async_cow,
@@ -411,14 +411,6 @@ static noinline int compress_file_range(struct inode *inode,
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
btrfs_add_inode_defrag(NULL, inode);
- /*
- * skip compression for a small file range(<=blocksize) that
- * isn't an inline extent, since it dosen't save disk space at all.
- */
- if ((end - start + 1) <= blocksize &&
- (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
- goto cleanup_and_bail_uncompressed;
-
actual_end = min_t(u64, isize, end + 1);
again:
will_compress = 0;
@@ -440,6 +432,14 @@ again:
total_compressed = actual_end - start;
+ /*
+ * skip compression for a small file range(<=blocksize) that
+ * isn't an inline extent, since it dosen't save disk space at all.
+ */
+ if (total_compressed <= blocksize &&
+ (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+ goto cleanup_and_bail_uncompressed;
+
/* we want to make sure that amount of ram required to uncompress
* an extent is reasonable, so we limit the total size in ram
* of a compressed extent to 128k. This is a crucial number
@@ -527,7 +527,10 @@ cont:
if (ret <= 0) {
unsigned long clear_flags = EXTENT_DELALLOC |
EXTENT_DEFRAG;
+ unsigned long page_error_op;
+
clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
+ page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
/*
* inline extent creation worked or returned error,
@@ -538,6 +541,7 @@ cont:
clear_flags, PAGE_UNLOCK |
PAGE_CLEAR_DIRTY |
PAGE_SET_WRITEBACK |
+ page_error_op |
PAGE_END_WRITEBACK);
goto free_pages_out;
}
@@ -620,8 +624,7 @@ cleanup_and_bail_uncompressed:
*num_added += 1;
}
-out:
- return ret;
+ return;
free_pages_out:
for (i = 0; i < nr_pages_ret; i++) {
@@ -629,8 +632,22 @@ free_pages_out:
page_cache_release(pages[i]);
}
kfree(pages);
+}
- goto out;
+static void free_async_extent_pages(struct async_extent *async_extent)
+{
+ int i;
+
+ if (!async_extent->pages)
+ return;
+
+ for (i = 0; i < async_extent->nr_pages; i++) {
+ WARN_ON(async_extent->pages[i]->mapping);
+ page_cache_release(async_extent->pages[i]);
+ }
+ kfree(async_extent->pages);
+ async_extent->nr_pages = 0;
+ async_extent->pages = NULL;
}
/*
@@ -639,7 +656,7 @@ free_pages_out:
* queued. We walk all the async extents created by compress_file_range
* and send them down to the disk.
*/
-static noinline int submit_compressed_extents(struct inode *inode,
+static noinline void submit_compressed_extents(struct inode *inode,
struct async_cow *async_cow)
{
struct async_extent *async_extent;
@@ -651,9 +668,6 @@ static noinline int submit_compressed_extents(struct inode *inode,
struct extent_io_tree *io_tree;
int ret = 0;
- if (list_empty(&async_cow->extents))
- return 0;
-
again:
while (!list_empty(&async_cow->extents)) {
async_extent = list_entry(async_cow->extents.next,
@@ -709,15 +723,7 @@ retry:
async_extent->compressed_size,
0, alloc_hint, &ins, 1, 1);
if (ret) {
- int i;
-
- for (i = 0; i < async_extent->nr_pages; i++) {
- WARN_ON(async_extent->pages[i]->mapping);
- page_cache_release(async_extent->pages[i]);
- }
- kfree(async_extent->pages);
- async_extent->nr_pages = 0;
- async_extent->pages = NULL;
+ free_async_extent_pages(async_extent);
if (ret == -ENOSPC) {
unlock_extent(io_tree, async_extent->start,
@@ -814,15 +820,26 @@ retry:
ins.objectid,
ins.offset, async_extent->pages,
async_extent->nr_pages);
+ if (ret) {
+ struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+ struct page *p = async_extent->pages[0];
+ const u64 start = async_extent->start;
+ const u64 end = start + async_extent->ram_size - 1;
+
+ p->mapping = inode->i_mapping;
+ tree->ops->writepage_end_io_hook(p, start, end,
+ NULL, 0);
+ p->mapping = NULL;
+ extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
+ PAGE_END_WRITEBACK |
+ PAGE_SET_ERROR);
+ free_async_extent_pages(async_extent);
+ }
alloc_hint = ins.objectid + ins.offset;
kfree(async_extent);
- if (ret)
- goto out;
cond_resched();
}
- ret = 0;
-out:
- return ret;
+ return;
out_free_reserve:
btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_free:
@@ -832,7 +849,9 @@ out_free:
NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
- PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
+ PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
+ PAGE_SET_ERROR);
+ free_async_extent_pages(async_extent);
kfree(async_extent);
goto again;
}
@@ -7000,9 +7019,12 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
btrfs_put_ordered_extent(ordered);
} else {
/* Screw you mmap */
- ret = filemap_write_and_wait_range(inode->i_mapping,
- lockstart,
- lockend);
+ ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
+ if (ret)
+ break;
+ ret = filemap_fdatawait_range(inode->i_mapping,
+ lockstart,
+ lockend);
if (ret)
break;
@@ -9442,6 +9464,21 @@ out_inode:
}
+/* Inspired by filemap_check_errors() */
+int btrfs_inode_check_errors(struct inode *inode)
+{
+ int ret = 0;
+
+ if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) &&
+ test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags))
+ ret = -ENOSPC;
+ if (test_bit(AS_EIO, &inode->i_mapping->flags) &&
+ test_and_clear_bit(AS_EIO, &inode->i_mapping->flags))
+ ret = -EIO;
+
+ return ret;
+}
+
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ac734ec..534544e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -220,6 +220,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
INIT_LIST_HEAD(&entry->work_list);
init_completion(&entry->completion);
INIT_LIST_HEAD(&entry->log_list);
+ INIT_LIST_HEAD(&entry->trans_list);
trace_btrfs_ordered_extent_add(inode, entry);
@@ -431,19 +432,31 @@ out:
/* Needs to either be called under a log transaction or the log_mutex */
void btrfs_get_logged_extents(struct inode *inode,
- struct list_head *logged_list)
+ struct list_head *logged_list,
+ const loff_t start,
+ const loff_t end)
{
struct btrfs_ordered_inode_tree *tree;
struct btrfs_ordered_extent *ordered;
struct rb_node *n;
+ struct rb_node *prev;
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irq(&tree->lock);
- for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+ n = __tree_search(&tree->tree, end, &prev);
+ if (!n)
+ n = prev;
+ for (; n; n = rb_prev(n)) {
ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+ if (ordered->file_offset > end)
+ continue;
+ if (entry_end(ordered) <= start)
+ break;
if (!list_empty(&ordered->log_list))
continue;
- list_add_tail(&ordered->log_list, logged_list);
+ if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+ continue;
+ list_add(&ordered->log_list, logged_list);
atomic_inc(&ordered->refs);
}
spin_unlock_irq(&tree->lock);
@@ -472,7 +485,8 @@ void btrfs_submit_logged_extents(struct list_head *logged_list,
spin_unlock_irq(&log->log_extents_lock[index]);
}
-void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
+void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log, u64 transid)
{
struct btrfs_ordered_extent *ordered;
int index = transid % 2;
@@ -497,7 +511,8 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
&ordered->flags));
- btrfs_put_ordered_extent(ordered);
+ if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+ list_add_tail(&ordered->trans_list, &trans->ordered);
spin_lock_irq(&log->log_extents_lock[index]);
}
spin_unlock_irq(&log->log_extents_lock[index]);
@@ -725,30 +740,10 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
/* start IO across the range first to instantiate any delalloc
* extents
*/
- ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
+ ret = btrfs_fdatawrite_range(inode, start, orig_end);
if (ret)
return ret;
- /*
- * So with compression we will find and lock a dirty page and clear the
- * first one as dirty, setup an async extent, and immediately return
- * with the entire range locked but with nobody actually marked with
- * writeback. So we can't just filemap_write_and_wait_range() and
- * expect it to work since it will just kick off a thread to do the
- * actual work. So we need to call filemap_fdatawrite_range _again_
- * since it will wait on the page lock, which won't be unlocked until
- * after the pages have been marked as writeback and so we're good to go
- * from there. We have to do this otherwise we'll miss the ordered
- * extents and that results in badness. Please Josef, do not think you
- * know better and pull this out at some point in the future, it is
- * right and you are wrong.
- */
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags)) {
- ret = filemap_fdatawrite_range(inode->i_mapping, start,
- orig_end);
- if (ret)
- return ret;
- }
+
ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
if (ret)
return ret;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index d81a274..e96cd4c 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -71,6 +71,8 @@ struct btrfs_ordered_sum {
ordered extent */
#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */
+#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent
+ * in the logging code. */
struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
@@ -121,6 +123,9 @@ struct btrfs_ordered_extent {
/* If we need to wait on this to be done */
struct list_head log_list;
+ /* If the transaction needs to wait on this ordered extent */
+ struct list_head trans_list;
+
/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
wait_queue_head_t wait;
@@ -193,11 +198,14 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
void btrfs_get_logged_extents(struct inode *inode,
- struct list_head *logged_list);
+ struct list_head *logged_list,
+ const loff_t start,
+ const loff_t end);
void btrfs_put_logged_extents(struct list_head *logged_list);
void btrfs_submit_logged_extents(struct list_head *logged_list,
struct btrfs_root *log);
-void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
+void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log, u64 transid);
void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
int __init ordered_data_init(void);
void ordered_data_exit(void);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index efa0831..4325bb0 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3310,6 +3310,50 @@ out:
scrub_pending_trans_workers_dec(sctx);
}
+static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
+ u64 logical)
+{
+ struct extent_state *cached_state = NULL;
+ struct btrfs_ordered_extent *ordered;
+ struct extent_io_tree *io_tree;
+ struct extent_map *em;
+ u64 lockstart = start, lockend = start + len - 1;
+ int ret = 0;
+
+ io_tree = &BTRFS_I(inode)->io_tree;
+
+ lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ ret = 1;
+ goto out_unlock;
+ }
+
+ em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_unlock;
+ }
+
+ /*
+ * This extent does not actually cover the logical extent anymore,
+ * move on to the next inode.
+ */
+ if (em->block_start > logical ||
+ em->block_start + em->block_len < logical + len) {
+ free_extent_map(em);
+ ret = 1;
+ goto out_unlock;
+ }
+ free_extent_map(em);
+
+out_unlock:
+ unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
+ GFP_NOFS);
+ return ret;
+}
+
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
struct scrub_copy_nocow_ctx *nocow_ctx)
{
@@ -3318,13 +3362,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
struct inode *inode;
struct page *page;
struct btrfs_root *local_root;
- struct btrfs_ordered_extent *ordered;
- struct extent_map *em;
- struct extent_state *cached_state = NULL;
struct extent_io_tree *io_tree;
u64 physical_for_dev_replace;
+ u64 nocow_ctx_logical;
u64 len = nocow_ctx->len;
- u64 lockstart = offset, lockend = offset + len - 1;
unsigned long index;
int srcu_index;
int ret = 0;
@@ -3356,30 +3397,13 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
io_tree = &BTRFS_I(inode)->io_tree;
+ nocow_ctx_logical = nocow_ctx->logical;
- lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
- ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
- if (ordered) {
- btrfs_put_ordered_extent(ordered);
- goto out_unlock;
- }
-
- em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out_unlock;
- }
-
- /*
- * This extent does not actually cover the logical extent anymore,
- * move on to the next inode.
- */
- if (em->block_start > nocow_ctx->logical ||
- em->block_start + em->block_len < nocow_ctx->logical + len) {
- free_extent_map(em);
- goto out_unlock;
+ ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
+ if (ret) {
+ ret = ret > 0 ? 0 : ret;
+ goto out;
}
- free_extent_map(em);
while (len >= PAGE_CACHE_SIZE) {
index = offset >> PAGE_CACHE_SHIFT;
@@ -3396,7 +3420,7 @@ again:
goto next_page;
} else {
ClearPageError(page);
- err = extent_read_full_page_nolock(io_tree, page,
+ err = extent_read_full_page(io_tree, page,
btrfs_get_extent,
nocow_ctx->mirror_num);
if (err) {
@@ -3421,6 +3445,14 @@ again:
goto next_page;
}
}
+
+ ret = check_extent_to_block(inode, offset, len,
+ nocow_ctx_logical);
+ if (ret) {
+ ret = ret > 0 ? 0 : ret;
+ goto next_page;
+ }
+
err = write_page_nocow(nocow_ctx->sctx,
physical_for_dev_replace, page);
if (err)
@@ -3434,12 +3466,10 @@ next_page:
offset += PAGE_CACHE_SIZE;
physical_for_dev_replace += PAGE_CACHE_SIZE;
+ nocow_ctx_logical += PAGE_CACHE_SIZE;
len -= PAGE_CACHE_SIZE;
}
ret = COPY_COMPLETE;
-out_unlock:
- unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
- GFP_NOFS);
out:
mutex_unlock(&inode->i_mutex);
iput(inode);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 65c75d9..391ec44 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1652,8 +1652,20 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
int i = 0, nr_devices;
int ret;
+ /*
+ * We aren't under the device list lock, so this is racey-ish, but good
+ * enough for our purposes.
+ */
nr_devices = fs_info->fs_devices->open_devices;
- BUG_ON(!nr_devices);
+ if (!nr_devices) {
+ smp_mb();
+ nr_devices = fs_info->fs_devices->open_devices;
+ ASSERT(nr_devices);
+ if (!nr_devices) {
+ *free_bytes = 0;
+ return 0;
+ }
+ }
devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
GFP_NOFS);
@@ -1678,11 +1690,17 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
else
min_stripe_size = BTRFS_STRIPE_LEN;
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (fs_info->alloc_start)
+ mutex_lock(&fs_devices->device_list_mutex);
+ rcu_read_lock();
+ list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
if (!device->in_fs_metadata || !device->bdev ||
device->is_tgtdev_for_dev_replace)
continue;
+ if (i >= nr_devices)
+ break;
+
avail_space = device->total_bytes - device->bytes_used;
/* align with stripe_len */
@@ -1697,24 +1715,32 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
skip_space = 1024 * 1024;
/* user can set the offset in fs_info->alloc_start. */
- if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
- device->total_bytes)
+ if (fs_info->alloc_start &&
+ fs_info->alloc_start + BTRFS_STRIPE_LEN <=
+ device->total_bytes) {
+ rcu_read_unlock();
skip_space = max(fs_info->alloc_start, skip_space);
- /*
- * btrfs can not use the free space in [0, skip_space - 1],
- * we must subtract it from the total. In order to implement
- * it, we account the used space in this range first.
- */
- ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
- &used_space);
- if (ret) {
- kfree(devices_info);
- return ret;
- }
+ /*
+ * btrfs can not use the free space in
+ * [0, skip_space - 1], we must subtract it from the
+ * total. In order to implement it, we account the used
+ * space in this range first.
+ */
+ ret = btrfs_account_dev_extents_size(device, 0,
+ skip_space - 1,
+ &used_space);
+ if (ret) {
+ kfree(devices_info);
+ mutex_unlock(&fs_devices->device_list_mutex);
+ return ret;
+ }
- /* calc the free space in [0, skip_space - 1] */
- skip_space -= used_space;
+ rcu_read_lock();
+
+ /* calc the free space in [0, skip_space - 1] */
+ skip_space -= used_space;
+ }
/*
* we can use the free space in [0, skip_space - 1], subtract
@@ -1733,6 +1759,9 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
i++;
}
+ rcu_read_unlock();
+ if (fs_info->alloc_start)
+ mutex_unlock(&fs_devices->device_list_mutex);
nr_devices = i;
@@ -1795,8 +1824,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
* holding chunk_muext to avoid allocating new chunks, holding
* device_list_mutex to avoid the device being removed
*/
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- mutex_lock(&fs_info->chunk_mutex);
rcu_read_lock();
list_for_each_entry_rcu(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1832,17 +1859,12 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bfree -= block_rsv->size >> bits;
spin_unlock(&block_rsv->lock);
- buf->f_bavail = total_free_data;
+ buf->f_bavail = div_u64(total_free_data, factor);
ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
- if (ret) {
- mutex_unlock(&fs_info->chunk_mutex);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ if (ret)
return ret;
- }
buf->f_bavail += div_u64(total_free_data, factor);
buf->f_bavail = buf->f_bavail >> bits;
- mutex_unlock(&fs_info->chunk_mutex);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
buf->f_type = BTRFS_SUPER_MAGIC;
buf->f_bsize = dentry->d_sb->s_blocksize;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9d8e2b8d..a605d4e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
}
}
+static void clear_btree_io_tree(struct extent_io_tree *tree)
+{
+ spin_lock(&tree->lock);
+ while (!RB_EMPTY_ROOT(&tree->state)) {
+ struct rb_node *node;
+ struct extent_state *state;
+
+ node = rb_first(&tree->state);
+ state = rb_entry(node, struct extent_state, rb_node);
+ rb_erase(&state->rb_node, &tree->state);
+ RB_CLEAR_NODE(&state->rb_node);
+ /*
+ * btree io trees aren't supposed to have tasks waiting for
+ * changes in the flags of extent states ever.
+ */
+ ASSERT(!waitqueue_active(&state->wq));
+ free_extent_state(state);
+ if (need_resched()) {
+ spin_unlock(&tree->lock);
+ cond_resched();
+ spin_lock(&tree->lock);
+ }
+ }
+ spin_unlock(&tree->lock);
+}
+
static noinline void switch_commit_roots(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info)
{
@@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans,
root->commit_root = btrfs_root_node(root);
if (is_fstree(root->objectid))
btrfs_unpin_free_ino(root);
+ clear_btree_io_tree(&root->dirty_log_pages);
}
up_write(&fs_info->commit_root_sem);
}
@@ -220,6 +247,7 @@ loop:
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
INIT_LIST_HEAD(&cur_trans->pending_chunks);
INIT_LIST_HEAD(&cur_trans->switch_commits);
+ INIT_LIST_HEAD(&cur_trans->pending_ordered);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
fs_info->btree_inode->i_mapping);
@@ -488,6 +516,7 @@ again:
h->sync = false;
INIT_LIST_HEAD(&h->qgroup_ref_list);
INIT_LIST_HEAD(&h->new_bgs);
+ INIT_LIST_HEAD(&h->ordered);
smp_mb();
if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -719,6 +748,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
+ if (!list_empty(&trans->ordered)) {
+ spin_lock(&info->trans_lock);
+ list_splice(&trans->ordered, &cur_trans->pending_ordered);
+ spin_unlock(&info->trans_lock);
+ }
+
trans->delayed_ref_updates = 0;
if (!trans->sync) {
must_run_delayed_refs =
@@ -828,17 +863,39 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
mark, &cached_state)) {
- convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
- mark, &cached_state, GFP_NOFS);
- cached_state = NULL;
- err = filemap_fdatawrite_range(mapping, start, end);
+ bool wait_writeback = false;
+
+ err = convert_extent_bit(dirty_pages, start, end,
+ EXTENT_NEED_WAIT,
+ mark, &cached_state, GFP_NOFS);
+ /*
+ * convert_extent_bit can return -ENOMEM, which is most of the
+ * time a temporary error. So when it happens, ignore the error
+ * and wait for writeback of this range to finish - because we
+ * failed to set the bit EXTENT_NEED_WAIT for the range, a call
+ * to btrfs_wait_marked_extents() would not know that writeback
+ * for this range started and therefore wouldn't wait for it to
+ * finish - we don't want to commit a superblock that points to
+ * btree nodes/leafs for which writeback hasn't finished yet
+ * (and without errors).
+ * We cleanup any entries left in the io tree when committing
+ * the transaction (through clear_btree_io_tree()).
+ */
+ if (err == -ENOMEM) {
+ err = 0;
+ wait_writeback = true;
+ }
+ if (!err)
+ err = filemap_fdatawrite_range(mapping, start, end);
if (err)
werr = err;
+ else if (wait_writeback)
+ werr = filemap_fdatawait_range(mapping, start, end);
+ free_extent_state(cached_state);
+ cached_state = NULL;
cond_resched();
start = end + 1;
}
- if (err)
- werr = err;
return werr;
}
@@ -862,11 +919,25 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
while (!find_first_extent_bit(dirty_pages, start, &start, &end,
EXTENT_NEED_WAIT, &cached_state)) {
- clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
- 0, 0, &cached_state, GFP_NOFS);
- err = filemap_fdatawait_range(mapping, start, end);
+ /*
+ * Ignore -ENOMEM errors returned by clear_extent_bit().
+ * When committing the transaction, we'll remove any entries
+ * left in the io tree. For a log commit, we don't remove them
+ * after committing the log because the tree can be accessed
+ * concurrently - we do it only at transaction commit time when
+ * it's safe to do it (through clear_btree_io_tree()).
+ */
+ err = clear_extent_bit(dirty_pages, start, end,
+ EXTENT_NEED_WAIT,
+ 0, 0, &cached_state, GFP_NOFS);
+ if (err == -ENOMEM)
+ err = 0;
+ if (!err)
+ err = filemap_fdatawait_range(mapping, start, end);
if (err)
werr = err;
+ free_extent_state(cached_state);
+ cached_state = NULL;
cond_resched();
start = end + 1;
}
@@ -919,17 +990,17 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
return 0;
}
-int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
+static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- if (!trans || !trans->transaction) {
- struct inode *btree_inode;
- btree_inode = root->fs_info->btree_inode;
- return filemap_write_and_wait(btree_inode->i_mapping);
- }
- return btrfs_write_and_wait_marked_extents(root,
+ int ret;
+
+ ret = btrfs_write_and_wait_marked_extents(root,
&trans->transaction->dirty_pages,
EXTENT_DIRTY);
+ clear_btree_io_tree(&trans->transaction->dirty_pages);
+
+ return ret;
}
/*
@@ -1652,6 +1723,28 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
btrfs_wait_ordered_roots(fs_info, -1);
}
+static inline void
+btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ spin_lock(&fs_info->trans_lock);
+ while (!list_empty(&cur_trans->pending_ordered)) {
+ ordered = list_first_entry(&cur_trans->pending_ordered,
+ struct btrfs_ordered_extent,
+ trans_list);
+ list_del_init(&ordered->trans_list);
+ spin_unlock(&fs_info->trans_lock);
+
+ wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
+ &ordered->flags));
+ btrfs_put_ordered_extent(ordered);
+ spin_lock(&fs_info->trans_lock);
+ }
+ spin_unlock(&fs_info->trans_lock);
+}
+
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
@@ -1702,6 +1795,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
}
spin_lock(&root->fs_info->trans_lock);
+ list_splice(&trans->ordered, &cur_trans->pending_ordered);
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
spin_unlock(&root->fs_info->trans_lock);
atomic_inc(&cur_trans->use_count);
@@ -1754,6 +1848,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_wait_delalloc_flush(root->fs_info);
+ btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+
btrfs_scrub_pause(root);
/*
* Ok now we need to make sure to block out any other joins while we
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 75ebcfc..00ed29c 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -56,6 +56,7 @@ struct btrfs_transaction {
wait_queue_head_t commit_wait;
struct list_head pending_snapshots;
struct list_head pending_chunks;
+ struct list_head pending_ordered;
struct list_head switch_commits;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
@@ -105,6 +106,7 @@ struct btrfs_trans_handle {
*/
struct btrfs_root *root;
struct seq_list delayed_ref_elem;
+ struct list_head ordered;
struct list_head qgroup_ref_list;
struct list_head new_bgs;
};
@@ -145,8 +147,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
-int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
void btrfs_add_dead_root(struct btrfs_root *root);
int btrfs_defrag_root(struct btrfs_root *root);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 286213c..9a02da1 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2599,12 +2599,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
index2 = root_log_ctx.log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
blk_finish_plug(&plug);
- btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
+ mark);
+ btrfs_wait_logged_extents(trans, log, log_transid);
wait_log_commit(trans, log_root_tree,
root_log_ctx.log_transid);
- btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
- ret = root_log_ctx.log_ret;
+ if (!ret)
+ ret = root_log_ctx.log_ret;
goto out;
}
ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
@@ -2641,11 +2643,18 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
- btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
- btrfs_wait_marked_extents(log_root_tree,
- &log_root_tree->dirty_log_pages,
- EXTENT_NEW | EXTENT_DIRTY);
- btrfs_wait_logged_extents(log, log_transid);
+ ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+ if (!ret)
+ ret = btrfs_wait_marked_extents(log_root_tree,
+ &log_root_tree->dirty_log_pages,
+ EXTENT_NEW | EXTENT_DIRTY);
+ if (ret) {
+ btrfs_set_log_full_commit(root->fs_info, trans);
+ btrfs_free_logged_extents(log, log_transid);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out_wake_log_root;
+ }
+ btrfs_wait_logged_extents(trans, log, log_transid);
btrfs_set_super_log_root(root->fs_info->super_for_commit,
log_root_tree->node->start);
@@ -3626,6 +3635,12 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
+ /*
+ * Clear the AS_EIO/AS_ENOSPC flags from the inode's
+ * i_mapping flags, so that the next fsync won't get
+ * an outdated io error too.
+ */
+ btrfs_inode_check_errors(inode);
*ordered_io_error = true;
break;
}
@@ -3766,7 +3781,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
- btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
+ btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
&token);
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
btrfs_set_token_file_extent_type(leaf, fi,
@@ -3963,7 +3978,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
mutex_lock(&BTRFS_I(inode)->log_mutex);
- btrfs_get_logged_extents(inode, &logged_list);
+ btrfs_get_logged_extents(inode, &logged_list, start, end);
/*
* a brute force approach to making sure we get the most uptodate
@@ -4089,6 +4104,21 @@ log_extents:
btrfs_release_path(path);
btrfs_release_path(dst_path);
if (fast_search) {
+ /*
+ * Some ordered extents started by fsync might have completed
+ * before we collected the ordered extents in logged_list, which
+ * means they're gone, not in our logged_list nor in the inode's
+ * ordered tree. We want the application/user space to know an
+ * error happened while attempting to persist file data so that
+ * it can take proper action. If such error happened, we leave
+ * without writing to the log tree and the fsync must report the
+ * file data write error and not commit the current transaction.
+ */
+ err = btrfs_inode_check_errors(inode);
+ if (err) {
+ ctx->io_err = err;
+ goto out_unlock;
+ }
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
&logged_list, ctx);
if (ret) {
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index dcf2013..47b1946 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -29,6 +29,7 @@
#include "xattr.h"
#include "disk-io.h"
#include "props.h"
+#include "locking.h"
ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
@@ -91,7 +92,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
struct inode *inode, const char *name,
const void *value, size_t size, int flags)
{
- struct btrfs_dir_item *di;
+ struct btrfs_dir_item *di = NULL;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
size_t name_len = strlen(name);
@@ -103,84 +104,119 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ path->skip_release_on_error = 1;
+
+ if (!value) {
+ di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
+ name, name_len, -1);
+ if (!di && (flags & XATTR_REPLACE))
+ ret = -ENODATA;
+ else if (di)
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ goto out;
+ }
+ /*
+ * For a replace we can't just do the insert blindly.
+ * Do a lookup first (read-only btrfs_search_slot), and return if xattr
+ * doesn't exist. If it exists, fall down below to the insert/replace
+ * path - we can't race with a concurrent xattr delete, because the VFS
+ * locks the inode's i_mutex before calling setxattr or removexattr.
+ */
if (flags & XATTR_REPLACE) {
- di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
- name_len, -1);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- } else if (!di) {
+ ASSERT(mutex_is_locked(&inode->i_mutex));
+ di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
+ name, name_len, 0);
+ if (!di) {
ret = -ENODATA;
goto out;
}
- ret = btrfs_delete_one_dir_name(trans, root, path, di);
- if (ret)
- goto out;
btrfs_release_path(path);
+ di = NULL;
+ }
+ ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
+ name, name_len, value, size);
+ if (ret == -EOVERFLOW) {
/*
- * remove the attribute
+ * We have an existing item in a leaf, split_leaf couldn't
+ * expand it. That item might have or not a dir_item that
+ * matches our target xattr, so lets check.
*/
- if (!value)
- goto out;
- } else {
- di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
- name, name_len, 0);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
+ ret = 0;
+ btrfs_assert_tree_locked(path->nodes[0]);
+ di = btrfs_match_dir_item_name(root, path, name, name_len);
+ if (!di && !(flags & XATTR_REPLACE)) {
+ ret = -ENOSPC;
goto out;
}
- if (!di && !value)
- goto out;
- btrfs_release_path(path);
+ } else if (ret == -EEXIST) {
+ ret = 0;
+ di = btrfs_match_dir_item_name(root, path, name, name_len);
+ ASSERT(di); /* logic error */
+ } else if (ret) {
+ goto out;
}
-again:
- ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
- name, name_len, value, size);
- /*
- * If we're setting an xattr to a new value but the new value is say
- * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
- * back from split_leaf. This is because it thinks we'll be extending
- * the existing item size, but we're asking for enough space to add the
- * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
- * the rest of the function figure it out.
- */
- if (ret == -EOVERFLOW)
+ if (di && (flags & XATTR_CREATE)) {
ret = -EEXIST;
+ goto out;
+ }
- if (ret == -EEXIST) {
- if (flags & XATTR_CREATE)
- goto out;
+ if (di) {
/*
- * We can't use the path we already have since we won't have the
- * proper locking for a delete, so release the path and
- * re-lookup to delete the thing.
+ * We're doing a replace, and it must be atomic, that is, at
+ * any point in time we have either the old or the new xattr
+ * value in the tree. We don't want readers (getxattr and
+ * listxattrs) to miss a value, this is specially important
+ * for ACLs.
*/
- btrfs_release_path(path);
- di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
- name, name_len, -1);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- } else if (!di) {
- /* Shouldn't happen but just in case... */
- btrfs_release_path(path);
- goto again;
+ const int slot = path->slots[0];
+ struct extent_buffer *leaf = path->nodes[0];
+ const u16 old_data_len = btrfs_dir_data_len(leaf, di);
+ const u32 item_size = btrfs_item_size_nr(leaf, slot);
+ const u32 data_size = sizeof(*di) + name_len + size;
+ struct btrfs_item *item;
+ unsigned long data_ptr;
+ char *ptr;
+
+ if (size > old_data_len) {
+ if (btrfs_leaf_free_space(root, leaf) <
+ (size - old_data_len)) {
+ ret = -ENOSPC;
+ goto out;
+ }
}
- ret = btrfs_delete_one_dir_name(trans, root, path, di);
- if (ret)
- goto out;
+ if (old_data_len + name_len + sizeof(*di) == item_size) {
+ /* No other xattrs packed in the same leaf item. */
+ if (size > old_data_len)
+ btrfs_extend_item(root, path,
+ size - old_data_len);
+ else if (size < old_data_len)
+ btrfs_truncate_item(root, path, data_size, 1);
+ } else {
+ /* There are other xattrs packed in the same item. */
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ if (ret)
+ goto out;
+ btrfs_extend_item(root, path, data_size);
+ }
+ item = btrfs_item_nr(slot);
+ ptr = btrfs_item_ptr(leaf, slot, char);
+ ptr += btrfs_item_size(leaf, item) - data_size;
+ di = (struct btrfs_dir_item *)ptr;
+ btrfs_set_dir_data_len(leaf, di, size);
+ data_ptr = ((unsigned long)(di + 1)) + name_len;
+ write_extent_buffer(leaf, value, data_ptr, size);
+ btrfs_mark_buffer_dirty(leaf);
+ } else {
/*
- * We have a value to set, so go back and try to insert it now.
+ * Insert, and we had space for the xattr, so path->slots[0] is
+ * where our xattr dir_item is and btrfs_insert_xattr_item()
+ * filled it.
*/
- if (value) {
- btrfs_release_path(path);
- goto again;
- }
}
out:
btrfs_free_path(path);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 659f2ea..cefca66 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2638,7 +2638,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
for (i = 0; i < CEPH_CAP_BITS; i++)
if ((dirty & (1 << i)) &&
- flush_tid == ci->i_cap_flush_tid[i])
+ (u16)flush_tid == ci->i_cap_flush_tid[i])
cleaned |= 1 << i;
dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 5228f20..4f46f7a 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -378,7 +378,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
loff_t offset = header->args.offset;
size_t count = header->args.count;
struct page **pages = header->args.pages;
- int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+ int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
unsigned int pg_len;
struct blk_plug plug;
int i;
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index e966c02..acbf9ca 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -65,17 +65,18 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
+ mutex_lock(&nn->bl_mutex);
bl_pipe_msg.bl_wq = &nn->bl_wq;
b->simple.len += 4; /* single volume */
if (b->simple.len > PAGE_SIZE)
- return -EIO;
+ goto out_unlock;
memset(msg, 0, sizeof(*msg));
msg->len = sizeof(*bl_msg) + b->simple.len;
msg->data = kzalloc(msg->len, gfp_mask);
if (!msg->data)
- goto out;
+ goto out_free_data;
bl_msg = msg->data;
bl_msg->type = BL_DEVICE_MOUNT,
@@ -87,7 +88,7 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
if (rc < 0) {
remove_wait_queue(&nn->bl_wq, &wq);
- goto out;
+ goto out_free_data;
}
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -97,12 +98,14 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
if (reply->status != BL_DEVICE_REQUEST_PROC) {
printk(KERN_WARNING "%s failed to decode device: %d\n",
__func__, reply->status);
- goto out;
+ goto out_free_data;
}
dev = MKDEV(reply->major, reply->minor);
-out:
+out_free_data:
kfree(msg->data);
+out_unlock:
+ mutex_unlock(&nn->bl_mutex);
return dev;
}
@@ -232,6 +235,7 @@ static int nfs4blocklayout_net_init(struct net *net)
struct nfs_net *nn = net_generic(net, nfs_net_id);
struct dentry *dentry;
+ mutex_init(&nn->bl_mutex);
init_waitqueue_head(&nn->bl_wq);
nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
if (IS_ERR(nn->bl_device_pipe))
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 5853f53..7f3f606 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -125,6 +125,8 @@ again:
continue;
if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
continue;
+ if (!nfs4_valid_open_stateid(state))
+ continue;
if (!nfs4_stateid_match(&state->stateid, stateid))
continue;
get_nfs_open_context(ctx);
@@ -193,7 +195,11 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *
{
int res = 0;
- res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync);
+ if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+ res = nfs4_proc_delegreturn(inode,
+ delegation->cred,
+ &delegation->stateid,
+ issync);
nfs_free_delegation(delegation);
return res;
}
@@ -380,11 +386,13 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation
{
struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
struct nfs_inode *nfsi = NFS_I(inode);
- int err;
+ int err = 0;
if (delegation == NULL)
return 0;
do {
+ if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+ break;
err = nfs_delegation_claim_opens(inode, &delegation->stateid);
if (!issync || err != -EAGAIN)
break;
@@ -605,10 +613,23 @@ static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *cl
rcu_read_unlock();
}
+static void nfs_revoke_delegation(struct inode *inode)
+{
+ struct nfs_delegation *delegation;
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (delegation != NULL) {
+ set_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
+ nfs_mark_return_delegation(NFS_SERVER(inode), delegation);
+ }
+ rcu_read_unlock();
+}
+
void nfs_remove_bad_delegation(struct inode *inode)
{
struct nfs_delegation *delegation;
+ nfs_revoke_delegation(inode);
delegation = nfs_inode_detach_delegation(inode);
if (delegation) {
nfs_inode_find_state_and_recover(inode, &delegation->stateid);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 5c1cce3..e3c20a3 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -31,6 +31,7 @@ enum {
NFS_DELEGATION_RETURN_IF_CLOSED,
NFS_DELEGATION_REFERENCED,
NFS_DELEGATION_RETURNING,
+ NFS_DELEGATION_REVOKED,
};
int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 06e8cfc..6e62155 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1527,6 +1527,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
case -ENOENT:
d_drop(dentry);
d_add(dentry, NULL);
+ nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
break;
case -EISDIR:
case -ENOTDIR:
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 20cffc8..10bf072 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -266,6 +266,7 @@ static void nfs_direct_req_free(struct kref *kref)
{
struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
+ nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo);
if (dreq->l_ctx != NULL)
nfs_put_lock_context(dreq->l_ctx);
if (dreq->ctx != NULL)
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 46fab1cb..7afb52f 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -145,9 +145,6 @@ static int filelayout_async_handle_error(struct rpc_task *task,
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
- if (state == NULL)
- break;
- nfs_remove_bad_delegation(state->inode);
case -NFS4ERR_OPENMODE:
if (state == NULL)
break;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 6388a59..00689a8 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -626,7 +626,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
- int err;
+ int err = 0;
trace_nfs_getattr_enter(inode);
/* Flush out writes to the server in order to update c/mtime. */
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index ef221fb..f0e06e4 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -19,6 +19,7 @@ struct nfs_net {
struct rpc_pipe *bl_device_pipe;
struct bl_dev_msg bl_mount_reply;
wait_queue_head_t bl_wq;
+ struct mutex bl_mutex;
struct list_head nfs_client_list;
struct list_head nfs_volume_list;
#if IS_ENABLED(CONFIG_NFS_V4)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 405bd95..69dc20a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -370,11 +370,6 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
- if (inode != NULL && nfs4_have_delegation(inode, FMODE_READ)) {
- nfs_remove_bad_delegation(inode);
- exception->retry = 1;
- break;
- }
if (state == NULL)
break;
ret = nfs4_schedule_stateid_recovery(server, state);
@@ -1654,7 +1649,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
nfs_inode_find_state_and_recover(state->inode,
stateid);
nfs4_schedule_stateid_recovery(server, state);
- return 0;
+ return -EAGAIN;
case -NFS4ERR_DELAY:
case -NFS4ERR_GRACE:
set_bit(NFS_DELEGATED_STATE, &state->flags);
@@ -2109,46 +2104,60 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta
return ret;
}
+static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state)
+{
+ nfs_remove_bad_delegation(state->inode);
+ write_seqlock(&state->seqlock);
+ nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+ write_sequnlock(&state->seqlock);
+ clear_bit(NFS_DELEGATED_STATE, &state->flags);
+}
+
+static void nfs40_clear_delegation_stateid(struct nfs4_state *state)
+{
+ if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL)
+ nfs_finish_clear_delegation_stateid(state);
+}
+
+static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+ /* NFSv4.0 doesn't allow for delegation recovery on open expire */
+ nfs40_clear_delegation_stateid(state);
+ return nfs4_open_expired(sp, state);
+}
+
#if defined(CONFIG_NFS_V4_1)
-static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
+static void nfs41_check_delegation_stateid(struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(state->inode);
- nfs4_stateid *stateid = &state->stateid;
+ nfs4_stateid stateid;
struct nfs_delegation *delegation;
- struct rpc_cred *cred = NULL;
- int status = -NFS4ERR_BAD_STATEID;
-
- /* If a state reset has been done, test_stateid is unneeded */
- if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
- return;
+ struct rpc_cred *cred;
+ int status;
/* Get the delegation credential for use by test/free_stateid */
rcu_read_lock();
delegation = rcu_dereference(NFS_I(state->inode)->delegation);
- if (delegation != NULL &&
- nfs4_stateid_match(&delegation->stateid, stateid)) {
- cred = get_rpccred(delegation->cred);
- rcu_read_unlock();
- status = nfs41_test_stateid(server, stateid, cred);
- trace_nfs4_test_delegation_stateid(state, NULL, status);
- } else
+ if (delegation == NULL) {
rcu_read_unlock();
+ return;
+ }
+
+ nfs4_stateid_copy(&stateid, &delegation->stateid);
+ cred = get_rpccred(delegation->cred);
+ rcu_read_unlock();
+ status = nfs41_test_stateid(server, &stateid, cred);
+ trace_nfs4_test_delegation_stateid(state, NULL, status);
if (status != NFS_OK) {
/* Free the stateid unless the server explicitly
* informs us the stateid is unrecognized. */
if (status != -NFS4ERR_BAD_STATEID)
- nfs41_free_stateid(server, stateid, cred);
- nfs_remove_bad_delegation(state->inode);
-
- write_seqlock(&state->seqlock);
- nfs4_stateid_copy(&state->stateid, &state->open_stateid);
- write_sequnlock(&state->seqlock);
- clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ nfs41_free_stateid(server, &stateid, cred);
+ nfs_finish_clear_delegation_stateid(state);
}
- if (cred != NULL)
- put_rpccred(cred);
+ put_rpccred(cred);
}
/**
@@ -2192,7 +2201,7 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
{
int status;
- nfs41_clear_delegation_stateid(state);
+ nfs41_check_delegation_stateid(state);
status = nfs41_check_open_stateid(state);
if (status != NFS_OK)
status = nfs4_open_expired(sp, state);
@@ -2231,19 +2240,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
ret = _nfs4_proc_open(opendata);
- if (ret != 0) {
- if (ret == -ENOENT) {
- dentry = opendata->dentry;
- if (dentry->d_inode)
- d_delete(dentry);
- else if (d_unhashed(dentry))
- d_add(dentry, NULL);
-
- nfs_set_verifier(dentry,
- nfs_save_change_attribute(opendata->dir->d_inode));
- }
+ if (ret != 0)
goto out;
- }
state = nfs4_opendata_to_nfs4_state(opendata);
ret = PTR_ERR(state);
@@ -4841,9 +4839,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
- if (state == NULL)
- break;
- nfs_remove_bad_delegation(state->inode);
case -NFS4ERR_OPENMODE:
if (state == NULL)
break;
@@ -8341,7 +8336,7 @@ static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
.state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
- .recover_open = nfs4_open_expired,
+ .recover_open = nfs40_open_expired,
.recover_lock = nfs4_lock_expired,
.establish_clid = nfs4_init_clientid,
};
@@ -8408,8 +8403,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
| NFS_CAP_CHANGE_ATTR
| NFS_CAP_POSIX_LOCK
| NFS_CAP_STATEID_NFSV41
- | NFS_CAP_ATOMIC_OPEN_V1
- | NFS_CAP_SEEK,
+ | NFS_CAP_ATOMIC_OPEN_V1,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
@@ -8431,7 +8425,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_CHANGE_ATTR
| NFS_CAP_POSIX_LOCK
| NFS_CAP_STATEID_NFSV41
- | NFS_CAP_ATOMIC_OPEN_V1,
+ | NFS_CAP_ATOMIC_OPEN_V1
+ | NFS_CAP_SEEK,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1249384..f83b02d 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -715,8 +715,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
nfs_release_request(req);
- else
- WARN_ON_ONCE(1);
}
static void
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 9d3e9c5..89326ac 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -229,8 +229,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
&fsnotify_mark_srcu);
}
+ /*
+ * We need to merge inode & vfsmount mark lists so that inode mark
+ * ignore masks are properly reflected for mount mark notifications.
+ * That's why this traversal is so complicated...
+ */
while (inode_node || vfsmount_node) {
- inode_group = vfsmount_group = NULL;
+ inode_group = NULL;
+ inode_mark = NULL;
+ vfsmount_group = NULL;
+ vfsmount_mark = NULL;
if (inode_node) {
inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
@@ -244,21 +252,19 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
vfsmount_group = vfsmount_mark->group;
}
- if (inode_group > vfsmount_group) {
- /* handle inode */
- ret = send_to_group(to_tell, inode_mark, NULL, mask,
- data, data_is, cookie, file_name);
- /* we didn't use the vfsmount_mark */
- vfsmount_group = NULL;
- } else if (vfsmount_group > inode_group) {
- ret = send_to_group(to_tell, NULL, vfsmount_mark, mask,
- data, data_is, cookie, file_name);
- inode_group = NULL;
- } else {
- ret = send_to_group(to_tell, inode_mark, vfsmount_mark,
- mask, data, data_is, cookie,
- file_name);
+ if (inode_group && vfsmount_group) {
+ int cmp = fsnotify_compare_groups(inode_group,
+ vfsmount_group);
+ if (cmp > 0) {
+ inode_group = NULL;
+ inode_mark = NULL;
+ } else if (cmp < 0) {
+ vfsmount_group = NULL;
+ vfsmount_mark = NULL;
+ }
}
+ ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask,
+ data, data_is, cookie, file_name);
if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
goto out;
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 9c0898c..3b68b0a 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -12,6 +12,10 @@ extern void fsnotify_flush_notify(struct fsnotify_group *group);
/* protects reads of inode and vfsmount marks list */
extern struct srcu_struct fsnotify_mark_srcu;
+/* compare two groups for sorting of marks lists */
+extern int fsnotify_compare_groups(struct fsnotify_group *a,
+ struct fsnotify_group *b);
+
extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark,
__u32 mask);
/* add a mark to an inode */
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index e849714..dfbf544 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -194,6 +194,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
{
struct fsnotify_mark *lmark, *last = NULL;
int ret = 0;
+ int cmp;
mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
@@ -219,11 +220,8 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
goto out;
}
- if (mark->group->priority < lmark->group->priority)
- continue;
-
- if ((mark->group->priority == lmark->group->priority) &&
- (mark->group < lmark->group))
+ cmp = fsnotify_compare_groups(lmark->group, mark->group);
+ if (cmp < 0)
continue;
hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d90deaa..34c38fa 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -210,6 +210,42 @@ void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mas
}
/*
+ * Sorting function for lists of fsnotify marks.
+ *
+ * Fanotify supports different notification classes (reflected as priority of
+ * notification group). Events shall be passed to notification groups in
+ * decreasing priority order. To achieve this marks in notification lists for
+ * inodes and vfsmounts are sorted so that priorities of corresponding groups
+ * are descending.
+ *
+ * Furthermore correct handling of the ignore mask requires processing inode
+ * and vfsmount marks of each group together. Using the group address as
+ * further sort criterion provides a unique sorting order and thus we can
+ * merge inode and vfsmount lists of marks in linear time and find groups
+ * present in both lists.
+ *
+ * A return value of 1 signifies that b has priority over a.
+ * A return value of 0 signifies that the two marks have to be handled together.
+ * A return value of -1 signifies that a has priority over b.
+ */
+int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
+{
+ if (a == b)
+ return 0;
+ if (!a)
+ return 1;
+ if (!b)
+ return -1;
+ if (a->priority < b->priority)
+ return 1;
+ if (a->priority > b->priority)
+ return -1;
+ if (a < b)
+ return 1;
+ return -1;
+}
+
+/*
* Attach an initialized mark to a given group and fs object.
* These marks may be used for the fsnotify backend to determine which
* event types should be delivered to which group.
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index ac851e8..faefa72 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -153,6 +153,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
struct mount *m = real_mount(mnt);
struct fsnotify_mark *lmark, *last = NULL;
int ret = 0;
+ int cmp;
mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
@@ -178,11 +179,8 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
goto out;
}
- if (mark->group->priority < lmark->group->priority)
- continue;
-
- if ((mark->group->priority == lmark->group->priority) &&
- (mark->group < lmark->group))
+ cmp = fsnotify_compare_groups(lmark->group, mark->group);
+ if (cmp < 0)
continue;
hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list);
OpenPOWER on IntegriCloud