diff options
44 files changed, 2768 insertions, 1068 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 9a2ec79..6dcdb2e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -362,6 +362,12 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } + if (btrfs_test_is_dummy_root(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -ENOENT; + goto out; + } + if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); else if (time_seq == (u64)-1) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 541fbfa..0340c57 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -667,7 +667,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); if (NULL == selected_super) { printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); - return -1; + return -ENOMEM; } list_for_each_entry(device, dev_head, dev_list) { @@ -845,8 +845,8 @@ static int btrfsic_process_superblock_dev_mirror( superblock_tmp->never_written = 0; superblock_tmp->mirror_num = 1 + superblock_mirror_num; if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)" - " @%llu (%s/%llu/%d)\n", + btrfs_info_in_rcu(device->dev_root->fs_info, + "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)", superblock_bdev, rcu_str_deref(device->name), dev_bytenr, dev_state->name, dev_bytenr, @@ -1660,7 +1660,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, sizeof(*block_ctx->pagev)) * num_pages, GFP_NOFS); if (!block_ctx->mem_to_free) - return -1; + return -ENOMEM; block_ctx->datav = block_ctx->mem_to_free; block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); for (i = 0; i < num_pages; i++) { diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 57ee8ca..97b049a 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -745,11 +745,13 @@ out: return ret; } -static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; -static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES]; -static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; -static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; -static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; +static struct { + struct list_head idle_ws; + spinlock_t ws_lock; + int num_ws; + atomic_t alloc_ws; + wait_queue_head_t ws_wait; +} btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zlib_compress, @@ -761,10 +763,10 @@ void __init btrfs_init_compress(void) int i; for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { - INIT_LIST_HEAD(&comp_idle_workspace[i]); - spin_lock_init(&comp_workspace_lock[i]); - atomic_set(&comp_alloc_workspace[i], 0); - init_waitqueue_head(&comp_workspace_wait[i]); + INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws); + spin_lock_init(&btrfs_comp_ws[i].ws_lock); + atomic_set(&btrfs_comp_ws[i].alloc_ws, 0); + init_waitqueue_head(&btrfs_comp_ws[i].ws_wait); } } @@ -778,38 +780,38 @@ static struct list_head *find_workspace(int type) int cpus = num_online_cpus(); int idx = type - 1; - struct list_head *idle_workspace = &comp_idle_workspace[idx]; - spinlock_t *workspace_lock = &comp_workspace_lock[idx]; - atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; - wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; - int *num_workspace = &comp_num_workspace[idx]; + struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; + spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; + atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws; + wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; + int *num_ws = &btrfs_comp_ws[idx].num_ws; again: - spin_lock(workspace_lock); - if (!list_empty(idle_workspace)) { - workspace = idle_workspace->next; + spin_lock(ws_lock); + if (!list_empty(idle_ws)) { + workspace = idle_ws->next; list_del(workspace); - (*num_workspace)--; - spin_unlock(workspace_lock); + (*num_ws)--; + spin_unlock(ws_lock); return workspace; } - if (atomic_read(alloc_workspace) > cpus) { + if (atomic_read(alloc_ws) > cpus) { DEFINE_WAIT(wait); - spin_unlock(workspace_lock); - prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE); - if (atomic_read(alloc_workspace) > cpus && !*num_workspace) + spin_unlock(ws_lock); + prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(alloc_ws) > cpus && !*num_ws) schedule(); - finish_wait(workspace_wait, &wait); + finish_wait(ws_wait, &wait); goto again; } - atomic_inc(alloc_workspace); - spin_unlock(workspace_lock); + atomic_inc(alloc_ws); + spin_unlock(ws_lock); workspace = btrfs_compress_op[idx]->alloc_workspace(); if (IS_ERR(workspace)) { - atomic_dec(alloc_workspace); - wake_up(workspace_wait); + atomic_dec(alloc_ws); + wake_up(ws_wait); } return workspace; } @@ -821,27 +823,30 @@ again: static void free_workspace(int type, struct list_head *workspace) { int idx = type - 1; - struct list_head *idle_workspace = &comp_idle_workspace[idx]; - spinlock_t *workspace_lock = &comp_workspace_lock[idx]; - atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; - wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; - int *num_workspace = &comp_num_workspace[idx]; - - spin_lock(workspace_lock); - if (*num_workspace < num_online_cpus()) { - list_add(workspace, idle_workspace); - (*num_workspace)++; - spin_unlock(workspace_lock); + struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; + spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; + atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws; + wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; + int *num_ws = &btrfs_comp_ws[idx].num_ws; + + spin_lock(ws_lock); + if (*num_ws < num_online_cpus()) { + list_add(workspace, idle_ws); + (*num_ws)++; + spin_unlock(ws_lock); goto wake; } - spin_unlock(workspace_lock); + spin_unlock(ws_lock); btrfs_compress_op[idx]->free_workspace(workspace); - atomic_dec(alloc_workspace); + atomic_dec(alloc_ws); wake: + /* + * Make sure counter is updated before we wake up waiters. + */ smp_mb(); - if (waitqueue_active(workspace_wait)) - wake_up(workspace_wait); + if (waitqueue_active(ws_wait)) + wake_up(ws_wait); } /* @@ -853,11 +858,11 @@ static void free_workspaces(void) int i; for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { - while (!list_empty(&comp_idle_workspace[i])) { - workspace = comp_idle_workspace[i].next; + while (!list_empty(&btrfs_comp_ws[i].idle_ws)) { + workspace = btrfs_comp_ws[i].idle_ws.next; list_del(workspace); btrfs_compress_op[i]->free_workspace(workspace); - atomic_dec(&comp_alloc_workspace[i]); + atomic_dec(&btrfs_comp_ws[i].alloc_ws); } } } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5f745ea..5b8e235 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1011,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, return ret; if (refs == 0) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); return ret; } } else { @@ -1927,7 +1927,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, child = read_node_slot(root, mid, 0); if (!child) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); goto enospc; } @@ -2030,7 +2030,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, */ if (!left) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); goto enospc; } wret = balance_node_right(trans, root, mid, left); @@ -4940,8 +4940,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct extent_buffer *leaf; struct btrfs_item *item; - int last_off; - int dsize = 0; + u32 last_off; + u32 dsize = 0; int ret = 0; int wret; int i; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 938efe3..a2e73f6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -823,8 +823,18 @@ struct btrfs_disk_balance_args { */ __le64 profiles; - /* usage filter */ - __le64 usage; + /* + * usage filter + * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N' + * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max + */ + union { + __le64 usage; + struct { + __le32 usage_min; + __le32 usage_max; + }; + }; /* devid filter */ __le64 devid; @@ -846,10 +856,27 @@ struct btrfs_disk_balance_args { /* BTRFS_BALANCE_ARGS_* */ __le64 flags; - /* BTRFS_BALANCE_ARGS_LIMIT value */ - __le64 limit; + /* + * BTRFS_BALANCE_ARGS_LIMIT with value 'limit' + * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum + * and maximum + */ + union { + __le64 limit; + struct { + __le32 limit_min; + __le32 limit_max; + }; + }; - __le64 unused[7]; + /* + * Process chunks that cross stripes_min..stripes_max devices, + * BTRFS_BALANCE_ARGS_STRIPES_RANGE + */ + __le32 stripes_min; + __le32 stripes_max; + + __le64 unused[6]; } __attribute__ ((__packed__)); /* @@ -1154,6 +1181,10 @@ struct btrfs_space_info { delalloc/allocations */ u64 bytes_readonly; /* total bytes that are read only */ + u64 max_extent_size; /* This will hold the maximum extent size of + the space info if we had an ENOSPC in the + allocator. */ + unsigned int full:1; /* indicates that we cannot allocate any more chunks for this space */ unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ @@ -1228,6 +1259,9 @@ struct btrfs_free_cluster { /* first extent starting offset */ u64 window_start; + /* We did a full search and couldn't create a cluster */ + bool fragmented; + struct btrfs_block_group_cache *block_group; /* * when a cluster is allocated from a block group, we put the @@ -1943,6 +1977,9 @@ struct btrfs_root { int send_in_progress; struct btrfs_subvolume_writers *subv_writers; atomic_t will_be_snapshoted; + + /* For qgroup metadata space reserve */ + atomic_t qgroup_meta_rsv; }; struct btrfs_ioctl_defrag_range_args { @@ -2145,6 +2182,8 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) #define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) +#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24) +#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (8192) @@ -2169,6 +2208,18 @@ struct btrfs_ioctl_defrag_range_args { btrfs_clear_opt(root->fs_info->mount_opt, opt); \ } +#ifdef CONFIG_BTRFS_DEBUG +static inline int +btrfs_should_fragment_free_space(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group) +{ + return (btrfs_test_opt(root, FRAGMENT_METADATA) && + block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || + (btrfs_test_opt(root, FRAGMENT_DATA) && + block_group->flags & BTRFS_BLOCK_GROUP_DATA); +} +#endif + /* * Requests for changes that need to be done during transaction commit. * @@ -3379,7 +3430,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, - u64 offset, struct btrfs_key *ins); + u64 offset, u64 ram_bytes, + struct btrfs_key *ins); int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, u64 offset, @@ -3398,7 +3450,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int no_quota); + u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len, int delalloc); @@ -3411,7 +3463,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int no_quota); + u64 root_objectid, u64 owner, u64 offset); int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -3449,8 +3501,11 @@ enum btrfs_reserve_flush_enum { BTRFS_RESERVE_FLUSH_ALL, }; -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes); -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes); +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, + u64 len); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); @@ -3466,8 +3521,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, u64 qgroup_reserved); int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len); +void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, unsigned short type); @@ -4004,8 +4059,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, /* sysfs.c */ int btrfs_init_sysfs(void); void btrfs_exit_sysfs(void); -int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info); -void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info); +int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); /* xattr.c */ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); @@ -4039,14 +4094,102 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) #define btrfs_info(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_INFO fmt, ##args) +/* + * Wrappers that use printk_in_rcu + */ +#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use a ratelimited printk_in_rcu + */ +#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use a ratelimited printk + */ +#define btrfs_emerg_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) #ifdef DEBUG #define btrfs_debug(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) #else #define btrfs_debug(fs_info, fmt, args...) \ no_printk(KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + no_printk(KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + no_printk(KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + no_printk(KERN_DEBUG fmt, ##args) #endif +#define btrfs_printk_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#define btrfs_printk_ratelimited(fs_info, fmt, args...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + btrfs_printk(fs_info, fmt, ##args); \ +} while (0) + +#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk_ratelimited(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + #ifdef CONFIG_BTRFS_ASSERT __cold @@ -4127,14 +4270,7 @@ do { \ __LINE__, (errno)); \ } while (0) -#define btrfs_std_error(fs_info, errno) \ -do { \ - if ((errno)) \ - __btrfs_std_error((fs_info), __func__, \ - __LINE__, (errno), NULL); \ -} while (0) - -#define btrfs_error(fs_info, errno, fmt, args...) \ +#define btrfs_std_error(fs_info, errno, fmt, args...) \ do { \ __btrfs_std_error((fs_info), __func__, __LINE__, \ (errno), fmt, ##args); \ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index a2ae427..e0941fb 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -463,6 +463,10 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, static void finish_one_item(struct btrfs_delayed_root *delayed_root) { int seq = atomic_inc_return(&delayed_root->items_seq); + + /* + * atomic_dec_return implies a barrier for waitqueue_active + */ if ((atomic_dec_return(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) && waitqueue_active(&delayed_root->wait)) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ac3e81d..e06dd75 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -197,6 +197,119 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, trans->delayed_ref_updates--; } +static bool merge_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *ref, + u64 seq) +{ + struct btrfs_delayed_ref_node *next; + bool done = false; + + next = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, + list); + while (!done && &next->list != &head->ref_list) { + int mod; + struct btrfs_delayed_ref_node *next2; + + next2 = list_next_entry(next, list); + + if (next == ref) + goto next; + + if (seq && next->seq >= seq) + goto next; + + if (next->type != ref->type) + goto next; + + if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY || + ref->type == BTRFS_SHARED_BLOCK_REF_KEY) && + comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref), + btrfs_delayed_node_to_tree_ref(next), + ref->type)) + goto next; + if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY || + ref->type == BTRFS_SHARED_DATA_REF_KEY) && + comp_data_refs(btrfs_delayed_node_to_data_ref(ref), + btrfs_delayed_node_to_data_ref(next))) + goto next; + + if (ref->action == next->action) { + mod = next->ref_mod; + } else { + if (ref->ref_mod < next->ref_mod) { + swap(ref, next); + done = true; + } + mod = -next->ref_mod; + } + + drop_delayed_ref(trans, delayed_refs, head, next); + ref->ref_mod += mod; + if (ref->ref_mod == 0) { + drop_delayed_ref(trans, delayed_refs, head, ref); + done = true; + } else { + /* + * Can't have multiples of the same ref on a tree block. + */ + WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || + ref->type == BTRFS_SHARED_BLOCK_REF_KEY); + } +next: + next = next2; + } + + return done; +} + +void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_node *ref; + u64 seq = 0; + + assert_spin_locked(&head->lock); + + if (list_empty(&head->ref_list)) + return; + + /* We don't have too many refs to merge for data. */ + if (head->is_data) + return; + + spin_lock(&fs_info->tree_mod_seq_lock); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + struct seq_list *elem; + + elem = list_first_entry(&fs_info->tree_mod_seq_list, + struct seq_list, list); + seq = elem->seq; + } + spin_unlock(&fs_info->tree_mod_seq_lock); + + ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, + list); + while (&ref->list != &head->ref_list) { + if (seq && ref->seq >= seq) + goto next; + + if (merge_ref(trans, delayed_refs, head, ref, seq)) { + if (list_empty(&head->ref_list)) + break; + ref = list_first_entry(&head->ref_list, + struct btrfs_delayed_ref_node, + list); + continue; + } +next: + ref = list_next_entry(ref, list); + } +} + int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, u64 seq) @@ -292,8 +405,7 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node, list); /* No need to compare bytenr nor is_head */ - if (exist->type != ref->type || exist->no_quota != ref->no_quota || - exist->seq != ref->seq) + if (exist->type != ref->type || exist->seq != ref->seq) goto add_tail; if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY || @@ -423,7 +535,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, struct btrfs_qgroup_extent_record *qrecord, - u64 bytenr, u64 num_bytes, int action, int is_data) + u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, + int action, int is_data) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; @@ -432,6 +545,9 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, int count_mod = 1; int must_insert_reserved = 0; + /* If reserved is provided, it must be a data extent. */ + BUG_ON(!is_data && reserved); + /* * the head node stores the sum of all the mods, so dropping a ref * should drop the sum in the head node by one. @@ -476,9 +592,16 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&head_ref->ref_list); head_ref->processing = 0; head_ref->total_ref_mod = count_mod; + head_ref->qgroup_reserved = 0; + head_ref->qgroup_ref_root = 0; /* Record qgroup extent info if provided */ if (qrecord) { + if (ref_root && reserved) { + head_ref->qgroup_ref_root = ref_root; + head_ref->qgroup_reserved = reserved; + } + qrecord->bytenr = bytenr; qrecord->num_bytes = num_bytes; qrecord->old_roots = NULL; @@ -497,6 +620,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { + WARN_ON(ref_root && reserved && existing->qgroup_ref_root + && existing->qgroup_reserved); update_existing_head_ref(delayed_refs, &existing->node, ref); /* * we've updated the existing ref, free the newly @@ -524,7 +649,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, - int action, int no_quota) + int action) { struct btrfs_delayed_tree_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; @@ -546,7 +671,6 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->action = action; ref->is_head = 0; ref->in_tree = 1; - ref->no_quota = no_quota; ref->seq = seq; full_ref = btrfs_delayed_node_to_tree_ref(ref); @@ -579,7 +703,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, - u64 offset, int action, int no_quota) + u64 offset, int action) { struct btrfs_delayed_data_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; @@ -602,7 +726,6 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->action = action; ref->is_head = 0; ref->in_tree = 1; - ref->no_quota = no_quota; ref->seq = seq; full_ref = btrfs_delayed_node_to_data_ref(ref); @@ -633,17 +756,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op, - int no_quota) + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; - if (!is_fstree(ref_root) || !fs_info->quota_enabled) - no_quota = 0; - BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) @@ -669,11 +788,10 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * the spin lock */ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, - bytenr, num_bytes, action, 0); + bytenr, num_bytes, 0, 0, action, 0); add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, - num_bytes, parent, ref_root, level, action, - no_quota); + num_bytes, parent, ref_root, level, action); spin_unlock(&delayed_refs->lock); return 0; @@ -693,18 +811,14 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op, - int no_quota) + u64 owner, u64 offset, u64 reserved, int action, + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; - if (!is_fstree(ref_root) || !fs_info->quota_enabled) - no_quota = 0; - BUG_ON(extent_op && !extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); if (!ref) @@ -736,16 +850,44 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * the spin lock */ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, - bytenr, num_bytes, action, 1); + bytenr, num_bytes, ref_root, reserved, + action, 1); add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, - action, no_quota); + action); spin_unlock(&delayed_refs->lock); return 0; } +int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + u64 ref_root, u64 bytenr, u64 num_bytes) +{ + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *ref_head; + int ret = 0; + + if (!fs_info->quota_enabled || !is_fstree(ref_root)) + return 0; + + delayed_refs = &trans->transaction->delayed_refs; + + spin_lock(&delayed_refs->lock); + ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0); + if (!ref_head) { + ret = -ENOENT; + goto out; + } + WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root); + ref_head->qgroup_ref_root = ref_root; + ref_head->qgroup_reserved = num_bytes; +out: + spin_unlock(&delayed_refs->lock); + return ret; +} + int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, @@ -764,7 +906,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, spin_lock(&delayed_refs->lock); add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, - num_bytes, BTRFS_UPDATE_DELAYED_HEAD, + num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data); spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 13fb5e6..00ed02c 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -68,7 +68,6 @@ struct btrfs_delayed_ref_node { unsigned int action:8; unsigned int type:8; - unsigned int no_quota:1; /* is this node still in the rbtree? */ unsigned int is_head:1; unsigned int in_tree:1; @@ -113,6 +112,17 @@ struct btrfs_delayed_ref_head { int total_ref_mod; /* + * For qgroup reserved space freeing. + * + * ref_root and reserved will be recorded after + * BTRFS_ADD_DELAYED_EXTENT is called. + * And will be used to free reserved qgroup space at + * run_delayed_refs() time. + */ + u64 qgroup_ref_root; + u64 qgroup_reserved; + + /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree * until the delayed ref is processed. must_insert_reserved is @@ -233,15 +243,16 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op, - int no_quota); + struct btrfs_delayed_extent_op *extent_op); int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op, - int no_quota); + u64 owner, u64 offset, u64 reserved, int action, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + u64 ref_root, u64 bytenr, u64 num_bytes); int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index e54dd59..1e668fb 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -327,19 +327,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args->start.tgtdev_name[0] == '\0') return -EINVAL; - /* - * Here we commit the transaction to make sure commit_total_bytes - * of all the devices are updated. - */ - trans = btrfs_attach_transaction(root); - if (!IS_ERR(trans)) { - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - } else if (PTR_ERR(trans) != -ENOENT) { - return PTR_ERR(trans); - } - /* the disk copy procedure reuses the scrub code */ mutex_lock(&fs_info->volume_mutex); ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, @@ -356,6 +343,19 @@ int btrfs_dev_replace_start(struct btrfs_root *root, if (ret) return ret; + /* + * Here we commit the transaction to make sure commit_total_bytes + * of all the devices are updated. + */ + trans = btrfs_attach_transaction(root); + if (!IS_ERR(trans)) { + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + } else if (PTR_ERR(trans) != -ENOENT) { + return PTR_ERR(trans); + } + btrfs_dev_replace_lock(dev_replace); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: @@ -375,12 +375,8 @@ int btrfs_dev_replace_start(struct btrfs_root *root, WARN_ON(!tgt_device); dev_replace->tgtdev = tgt_device; - ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device); - if (ret) - btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); - - printk_in_rcu(KERN_INFO - "BTRFS: dev_replace from %s (devid %llu) to %s started\n", + btrfs_info_in_rcu(root->fs_info, + "dev_replace from %s (devid %llu) to %s started", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -401,6 +397,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; btrfs_dev_replace_unlock(dev_replace); + ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); + if (ret) + btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); + btrfs_wait_ordered_roots(root->fs_info, -1); /* force writing the updated state information to disk */ @@ -454,8 +454,7 @@ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) { clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); - if (waitqueue_active(&fs_info->replace_wait)) - wake_up(&fs_info->replace_wait); + wake_up(&fs_info->replace_wait); } static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, @@ -523,8 +522,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, src_device, tgt_device); } else { - printk_in_rcu(KERN_ERR - "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", + btrfs_err_in_rcu(root->fs_info, + "btrfs_scrub_dev(%s, %llu, %s) failed %d", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -540,8 +539,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, return scrub_ret; } - printk_in_rcu(KERN_INFO - "BTRFS: dev_replace from %s (devid %llu) to %s finished\n", + btrfs_info_in_rcu(root->fs_info, + "dev_replace from %s (devid %llu) to %s finished", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -586,7 +585,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&uuid_mutex); /* replace the sysfs entry */ - btrfs_kobj_rm_device(fs_info->fs_devices, src_device); + btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device); btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); /* write back the superblocks */ @@ -809,8 +808,8 @@ static int btrfs_dev_replace_kthread(void *data) progress = status_args->status.progress_1000; kfree(status_args); progress = div_u64(progress, 10); - printk_in_rcu(KERN_INFO - "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", + btrfs_info_in_rcu(fs_info, + "continuing dev_replace from %s (devid %llu) to %s @%u%%", dev_replace->srcdev->missing ? "<missing disk>" : rcu_str_deref(dev_replace->srcdev->name), dev_replace->srcdev->devid, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1e60d00..2d46675 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -319,9 +319,9 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - printk_ratelimited(KERN_WARNING - "BTRFS: %s checksum verify failed on %llu wanted %X found %X " - "level %d\n", + btrfs_warn_rl(fs_info, + "%s checksum verify failed on %llu wanted %X found %X " + "level %d", fs_info->sb->s_id, buf->start, val, found, btrfs_header_level(buf)); if (result != (char *)&inline_result) @@ -368,9 +368,9 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 0; goto out; } - printk_ratelimited(KERN_ERR - "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n", - eb->fs_info->sb->s_id, eb->start, + btrfs_err_rl(eb->fs_info, + "parent transid verify failed on %llu wanted %llu found %llu", + eb->start, parent_transid, btrfs_header_generation(eb)); ret = 1; @@ -629,15 +629,14 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start " - "%llu %llu\n", - eb->fs_info->sb->s_id, found_start, eb->start); + btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu", + found_start, eb->start); ret = -EIO; goto err; } if (check_tree_block_fsid(root->fs_info, eb)) { - printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n", - eb->fs_info->sb->s_id, eb->start); + btrfs_err_rl(eb->fs_info, "bad fsid on block %llu", + eb->start); ret = -EIO; goto err; } @@ -802,6 +801,9 @@ static void run_one_async_done(struct btrfs_work *work) limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; + /* + * atomic_dec_return implies a barrier for waitqueue_active + */ if (atomic_dec_return(&fs_info->nr_async_submits) < limit && waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); @@ -1265,6 +1267,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, atomic_set(&root->orphan_inodes, 0); atomic_set(&root->refs, 1); atomic_set(&root->will_be_snapshoted, 0); + atomic_set(&root->qgroup_meta_rsv, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; @@ -1759,6 +1762,7 @@ static int cleaner_kthread(void *arg) int again; struct btrfs_trans_handle *trans; + set_freezable(); do { again = 0; @@ -2348,8 +2352,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, u64 bytenr = btrfs_super_log_root(disk_super); if (fs_devices->rw_devices == 0) { - printk(KERN_WARNING "BTRFS: log replay required " - "on RO media\n"); + btrfs_warn(fs_info, "log replay required on RO media"); return -EIO; } @@ -2364,12 +2367,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, log_tree_root->node = read_tree_block(tree_root, bytenr, fs_info->generation + 1); if (IS_ERR(log_tree_root->node)) { - printk(KERN_ERR "BTRFS: failed to read log tree\n"); + btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); kfree(log_tree_root); return ret; } else if (!extent_buffer_uptodate(log_tree_root->node)) { - printk(KERN_ERR "BTRFS: failed to read log tree\n"); + btrfs_err(fs_info, "failed to read log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); return -EIO; @@ -2377,7 +2380,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); if (ret) { - btrfs_error(tree_root->fs_info, ret, + btrfs_std_error(tree_root->fs_info, ret, "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); @@ -2653,8 +2656,8 @@ int open_ctree(struct super_block *sb, * Read super block and check the signature bytes only */ bh = btrfs_read_dev_super(fs_devices->latest_bdev); - if (!bh) { - err = -EINVAL; + if (IS_ERR(bh)) { + err = PTR_ERR(bh); goto fail_alloc; } @@ -2937,7 +2940,7 @@ retry_root_backup: goto fail_fsdev_sysfs; } - ret = btrfs_sysfs_add_one(fs_info); + ret = btrfs_sysfs_add_mounted(fs_info); if (ret) { pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); goto fail_fsdev_sysfs; @@ -3117,7 +3120,7 @@ fail_cleaner: filemap_write_and_wait(fs_info->btree_inode->i_mapping); fail_sysfs: - btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_mounted(fs_info); fail_fsdev_sysfs: btrfs_sysfs_remove_fsid(fs_info->fs_devices); @@ -3179,8 +3182,8 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) struct btrfs_device *device = (struct btrfs_device *) bh->b_private; - printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to " - "I/O error on %s\n", + btrfs_warn_rl_in_rcu(device->dev_root->fs_info, + "lost page write due to IO error on %s", rcu_str_deref(device->name)); /* note, we dont' set_buffer_write_io_error because we have * our own ways of dealing with the IO errors @@ -3192,6 +3195,37 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) put_bh(bh); } +int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, + struct buffer_head **bh_ret) +{ + struct buffer_head *bh; + struct btrfs_super_block *super; + u64 bytenr; + + bytenr = btrfs_sb_offset(copy_num); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) + return -EINVAL; + + bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); + /* + * If we fail to read from the underlying devices, as of now + * the best option we have is to mark it EIO. + */ + if (!bh) + return -EIO; + + super = (struct btrfs_super_block *)bh->b_data; + if (btrfs_super_bytenr(super) != bytenr || + btrfs_super_magic(super) != BTRFS_MAGIC) { + brelse(bh); + return -EINVAL; + } + + *bh_ret = bh; + return 0; +} + + struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) { struct buffer_head *bh; @@ -3199,7 +3233,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) struct btrfs_super_block *super; int i; u64 transid = 0; - u64 bytenr; + int ret = -EINVAL; /* we would like to check all the supers, but that would make * a btrfs mount succeed after a mkfs from a different FS. @@ -3207,21 +3241,11 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) * later supers, using BTRFS_SUPER_MIRROR_MAX instead */ for (i = 0; i < 1; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - i_size_read(bdev->bd_inode)) - break; - bh = __bread(bdev, bytenr / 4096, - BTRFS_SUPER_INFO_SIZE); - if (!bh) + ret = btrfs_read_dev_one_super(bdev, i, &bh); + if (ret) continue; super = (struct btrfs_super_block *)bh->b_data; - if (btrfs_super_bytenr(super) != bytenr || - btrfs_super_magic(super) != BTRFS_MAGIC) { - brelse(bh); - continue; - } if (!latest || btrfs_super_generation(super) > transid) { brelse(latest); @@ -3231,6 +3255,10 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) brelse(bh); } } + + if (!latest) + return ERR_PTR(ret); + return latest; } @@ -3299,8 +3327,9 @@ static int write_dev_supers(struct btrfs_device *device, bh = __getblk(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); if (!bh) { - printk(KERN_ERR "BTRFS: couldn't get super " - "buffer head for bytenr %Lu\n", bytenr); + btrfs_err(device->dev_root->fs_info, + "couldn't get super buffer head for bytenr %llu", + bytenr); errors++; continue; } @@ -3449,22 +3478,31 @@ static int barrier_all_devices(struct btrfs_fs_info *info) int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) { - if ((flags & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_AVAIL_ALLOC_BIT_SINGLE)) || - ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)) - return 0; + int raid_type; + int min_tolerated = INT_MAX; - if (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID10)) - return 1; + if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 || + (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE)) + min_tolerated = min(min_tolerated, + btrfs_raid_array[BTRFS_RAID_SINGLE]. + tolerated_failures); - if (flags & BTRFS_BLOCK_GROUP_RAID6) - return 2; + for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { + if (raid_type == BTRFS_RAID_SINGLE) + continue; + if (!(flags & btrfs_raid_group[raid_type])) + continue; + min_tolerated = min(min_tolerated, + btrfs_raid_array[raid_type]. + tolerated_failures); + } - pr_warn("BTRFS: unknown raid type: %llu\n", flags); - return 0; + if (min_tolerated == INT_MAX) { + pr_warn("BTRFS: unknown raid flag: %llu\n", flags); + min_tolerated = 0; + } + + return min_tolerated; } int btrfs_calc_num_tolerated_disk_barrier_failures( @@ -3548,7 +3586,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) if (ret) { mutex_unlock( &root->fs_info->fs_devices->device_list_mutex); - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "errors while submitting device barriers."); return ret; } @@ -3588,7 +3626,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); /* FUA is masked off if unsupported and can't be the reason */ - btrfs_error(root->fs_info, -EIO, + btrfs_std_error(root->fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } @@ -3606,7 +3644,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) } mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (total_errors > max_errors) { - btrfs_error(root->fs_info, -EIO, + btrfs_std_error(root->fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } @@ -3792,7 +3830,7 @@ void close_ctree(struct btrfs_root *root) percpu_counter_sum(&fs_info->delalloc_bytes)); } - btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_mounted(fs_info); btrfs_sysfs_remove_fsid(fs_info->fs_devices); btrfs_free_fs_roots(fs_info); @@ -4290,25 +4328,6 @@ again: return 0; } -static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_ordered_extent *ordered; - - spin_lock(&fs_info->trans_lock); - while (!list_empty(&cur_trans->pending_ordered)) { - ordered = list_first_entry(&cur_trans->pending_ordered, - struct btrfs_ordered_extent, - trans_list); - list_del_init(&ordered->trans_list); - spin_unlock(&fs_info->trans_lock); - - btrfs_put_ordered_extent(ordered); - spin_lock(&fs_info->trans_lock); - } - spin_unlock(&fs_info->trans_lock); -} - void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_root *root) { @@ -4320,7 +4339,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&root->fs_info->transaction_wait); - btrfs_free_pending_ordered(cur_trans, root->fs_info); btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index bdfb479..adeb318 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -60,6 +60,8 @@ void close_ctree(struct btrfs_root *root); int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); +int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, + struct buffer_head **bh_ret); int btrfs_commit_super(struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 601d7d4..99a8e57 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -95,8 +95,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins, - int no_quota); + int level, struct btrfs_key *ins); static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 flags, int force); @@ -332,6 +331,27 @@ static void put_caching_control(struct btrfs_caching_control *ctl) kfree(ctl); } +#ifdef CONFIG_BTRFS_DEBUG +static void fragment_free_space(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group) +{ + u64 start = block_group->key.objectid; + u64 len = block_group->key.offset; + u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? + root->nodesize : root->sectorsize; + u64 step = chunk << 1; + + while (len > chunk) { + btrfs_remove_free_space(block_group, start, chunk); + start += step; + if (len < step) + len = 0; + else + len -= step; + } +} +#endif + /* * this is only called by cache_block_group, since we could have freed extents * we need to check the pinned_extents for any extents that can't be used yet @@ -388,6 +408,7 @@ static noinline void caching_thread(struct btrfs_work *work) u64 last = 0; u32 nritems; int ret = -ENOMEM; + bool wakeup = true; caching_ctl = container_of(work, struct btrfs_caching_control, work); block_group = caching_ctl->block_group; @@ -400,6 +421,15 @@ static noinline void caching_thread(struct btrfs_work *work) last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); +#ifdef CONFIG_BTRFS_DEBUG + /* + * If we're fragmenting we don't want to make anybody think we can + * allocate from this block group until we've had a chance to fragment + * the free space. + */ + if (btrfs_should_fragment_free_space(extent_root, block_group)) + wakeup = false; +#endif /* * We don't want to deadlock with somebody trying to allocate a new * extent for the extent root while also trying to search the extent @@ -441,7 +471,8 @@ next: if (need_resched() || rwsem_is_contended(&fs_info->commit_root_sem)) { - caching_ctl->progress = last; + if (wakeup) + caching_ctl->progress = last; btrfs_release_path(path); up_read(&fs_info->commit_root_sem); mutex_unlock(&caching_ctl->mutex); @@ -464,7 +495,8 @@ next: key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; - caching_ctl->progress = last; + if (wakeup) + caching_ctl->progress = last; btrfs_release_path(path); goto next; } @@ -491,7 +523,8 @@ next: if (total_found > (1024 * 1024 * 2)) { total_found = 0; - wake_up(&caching_ctl->wait); + if (wakeup) + wake_up(&caching_ctl->wait); } } path->slots[0]++; @@ -501,13 +534,27 @@ next: total_found += add_new_free_space(block_group, fs_info, last, block_group->key.objectid + block_group->key.offset); - caching_ctl->progress = (u64)-1; - spin_lock(&block_group->lock); block_group->caching_ctl = NULL; block_group->cached = BTRFS_CACHE_FINISHED; spin_unlock(&block_group->lock); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(extent_root, block_group)) { + u64 bytes_used; + + spin_lock(&block_group->space_info->lock); + spin_lock(&block_group->lock); + bytes_used = block_group->key.offset - + btrfs_block_group_used(&block_group->item); + block_group->space_info->bytes_used += bytes_used >> 1; + spin_unlock(&block_group->lock); + spin_unlock(&block_group->space_info->lock); + fragment_free_space(extent_root, block_group); + } +#endif + + caching_ctl->progress = (u64)-1; err: btrfs_free_path(path); up_read(&fs_info->commit_root_sem); @@ -607,6 +654,22 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, } } spin_unlock(&cache->lock); +#ifdef CONFIG_BTRFS_DEBUG + if (ret == 1 && + btrfs_should_fragment_free_space(fs_info->extent_root, + cache)) { + u64 bytes_used; + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + bytes_used = cache->key.offset - + btrfs_block_group_used(&cache->item); + cache->space_info->bytes_used += bytes_used >> 1; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + fragment_free_space(fs_info->extent_root, cache); + } +#endif mutex_unlock(&caching_ctl->mutex); wake_up(&caching_ctl->wait); @@ -2009,8 +2072,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, - int no_quota) + u64 root_objectid, u64 owner, u64 offset) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -2022,12 +2084,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL, no_quota); + BTRFS_ADD_DELAYED_REF, NULL); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL, no_quota); + num_bytes, parent, root_objectid, + owner, offset, 0, + BTRFS_ADD_DELAYED_REF, NULL); } return ret; } @@ -2048,15 +2110,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 num_bytes = node->num_bytes; u64 refs; int ret; - int no_quota = node->no_quota; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) - no_quota = 1; - path->reada = 1; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ @@ -2291,8 +2349,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, parent, ref_root, extent_op->flags_to_set, &extent_op->key, - ref->level, &ins, - node->no_quota); + ref->level, &ins); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, root, node, parent, ref_root, @@ -2345,6 +2402,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, node->num_bytes); } } + + /* Also free its reserved qgroup space */ + btrfs_qgroup_free_delayed_ref(root->fs_info, + head->qgroup_ref_root, + head->qgroup_reserved); return ret; } @@ -2433,7 +2495,21 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } } + /* + * We need to try and merge add/drops of the same ref since we + * can run into issues with relocate dropping the implicit ref + * and then it being added back again before the drop can + * finish. If we merged anything we need to re-loop so we can + * get a good ref. + * Or we can get node references of the same type that weren't + * merged when created due to bumps in the tree mod seq, and + * we need to merge them to prevent adding an inline extent + * backref before dropping it (triggering a BUG_ON at + * insert_inline_extent_backref()). + */ spin_lock(&locked_ref->lock); + btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, + locked_ref); /* * locked_ref is the head node, so we have to go one @@ -3109,7 +3185,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int level; int ret = 0; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64, int); + u64, u64, u64, u64, u64, u64); if (btrfs_test_is_dummy_root(root)) @@ -3150,15 +3226,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset, 1); + key.offset); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = root->nodesize; ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, level - 1, 0, - 1); + parent, ref_root, level - 1, 0); if (ret) goto fail; } @@ -3339,6 +3414,15 @@ again: spin_unlock(&block_group->lock); /* + * We hit an ENOSPC when setting up the cache in this transaction, just + * skip doing the setup, we've already cleared the cache so we're safe. + */ + if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { + ret = -ENOSPC; + goto out_put; + } + + /* * Try to preallocate enough space based on how big the block group is. * Keep in mind this has to include any pinned space which could end up * taking up quite a bit since it's not folded into the other space @@ -3351,16 +3435,26 @@ again: num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, num_pages, num_pages); + ret = btrfs_check_data_free_space(inode, 0, num_pages); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, num_pages, num_pages, &alloc_hint); + /* + * Our cache requires contiguous chunks so that we don't modify a bunch + * of metadata or split extents when writing the cache out, which means + * we can enospc if we are heavily fragmented in addition to just normal + * out of space conditions. So if we hit this just skip setting up any + * other block groups for this transaction, maybe we'll unpin enough + * space the next time around. + */ if (!ret) dcs = BTRFS_DC_SETUP; - btrfs_free_reserved_data_space(inode, num_pages); + else if (ret == -ENOSPC) + set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); + btrfs_free_reserved_data_space(inode, 0, num_pages); out_put: iput(inode); @@ -3746,6 +3840,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_readonly = 0; found->bytes_may_use = 0; found->full = 0; + found->max_extent_size = 0; found->force_alloc = CHUNK_ALLOC_NO_FORCE; found->chunk_alloc = 0; found->flush = 0; @@ -3822,7 +3917,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { u64 num_devices = root->fs_info->fs_devices->rw_devices; u64 target; - u64 tmp; + u64 raid_type; + u64 allowed = 0; /* * see if restripe for this chunk_type is in progress, if so @@ -3840,31 +3936,26 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) spin_unlock(&root->fs_info->balance_lock); /* First, mask out the RAID levels which aren't possible */ - if (num_devices == 1) - flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID5); - if (num_devices < 3) - flags &= ~BTRFS_BLOCK_GROUP_RAID6; - if (num_devices < 4) - flags &= ~BTRFS_BLOCK_GROUP_RAID10; - - tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); - flags &= ~tmp; - - if (tmp & BTRFS_BLOCK_GROUP_RAID6) - tmp = BTRFS_BLOCK_GROUP_RAID6; - else if (tmp & BTRFS_BLOCK_GROUP_RAID5) - tmp = BTRFS_BLOCK_GROUP_RAID5; - else if (tmp & BTRFS_BLOCK_GROUP_RAID10) - tmp = BTRFS_BLOCK_GROUP_RAID10; - else if (tmp & BTRFS_BLOCK_GROUP_RAID1) - tmp = BTRFS_BLOCK_GROUP_RAID1; - else if (tmp & BTRFS_BLOCK_GROUP_RAID0) - tmp = BTRFS_BLOCK_GROUP_RAID0; - - return extended_to_chunk(flags | tmp); + for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { + if (num_devices >= btrfs_raid_array[raid_type].devs_min) + allowed |= btrfs_raid_group[raid_type]; + } + allowed &= flags; + + if (allowed & BTRFS_BLOCK_GROUP_RAID6) + allowed = BTRFS_BLOCK_GROUP_RAID6; + else if (allowed & BTRFS_BLOCK_GROUP_RAID5) + allowed = BTRFS_BLOCK_GROUP_RAID5; + else if (allowed & BTRFS_BLOCK_GROUP_RAID10) + allowed = BTRFS_BLOCK_GROUP_RAID10; + else if (allowed & BTRFS_BLOCK_GROUP_RAID1) + allowed = BTRFS_BLOCK_GROUP_RAID1; + else if (allowed & BTRFS_BLOCK_GROUP_RAID0) + allowed = BTRFS_BLOCK_GROUP_RAID0; + + flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; + + return extended_to_chunk(flags | allowed); } static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) @@ -3903,11 +3994,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) return ret; } -/* - * This will check the space that the inode allocates from to make sure we have - * enough space for bytes. - */ -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes) +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes) { struct btrfs_space_info *data_sinfo; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4006,7 +4093,8 @@ commit_trans: if (IS_ERR(trans)) return PTR_ERR(trans); if (have_pinned_space >= 0 || - trans->transaction->have_free_bgs || + test_bit(BTRFS_TRANS_HAVE_FREE_BGS, + &trans->transaction->flags) || need_commit > 0) { ret = btrfs_commit_transaction(trans, root); if (ret) @@ -4028,38 +4116,86 @@ commit_trans: data_sinfo->flags, bytes, 1); return -ENOSPC; } - ret = btrfs_qgroup_reserve(root, write_bytes); - if (ret) - goto out; data_sinfo->bytes_may_use += bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", data_sinfo->flags, bytes, 1); -out: spin_unlock(&data_sinfo->lock); return ret; } /* - * Called if we need to clear a data reservation for this inode. + * New check_data_free_space() with ability for precious data reservation + * Will replace old btrfs_check_data_free_space(), but for patch split, + * add a new function first and then replace it. + */ +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + /* align the range */ + len = round_up(start + len, root->sectorsize) - + round_down(start, root->sectorsize); + start = round_down(start, root->sectorsize); + + ret = btrfs_alloc_data_chunk_ondemand(inode, len); + if (ret < 0) + return ret; + + /* + * Use new btrfs_qgroup_reserve_data to reserve precious data space + * + * TODO: Find a good method to avoid reserve data space for NOCOW + * range, but don't impact performance on quota disable case. + */ + ret = btrfs_qgroup_reserve_data(inode, start, len); + return ret; +} + +/* + * Called if we need to clear a data reservation for this inode + * Normally in a error case. + * + * This one will *NOT* use accurate qgroup reserved space API, just for case + * which we can't sleep and is sure it won't affect qgroup reserved space. + * Like clear_bit_hook(). */ -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, + u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_space_info *data_sinfo; - /* make sure bytes are sectorsize aligned */ - bytes = ALIGN(bytes, root->sectorsize); + /* Make sure the range is aligned to sectorsize */ + len = round_up(start + len, root->sectorsize) - + round_down(start, root->sectorsize); + start = round_down(start, root->sectorsize); data_sinfo = root->fs_info->data_sinfo; spin_lock(&data_sinfo->lock); - WARN_ON(data_sinfo->bytes_may_use < bytes); - data_sinfo->bytes_may_use -= bytes; + if (WARN_ON(data_sinfo->bytes_may_use < len)) + data_sinfo->bytes_may_use = 0; + else + data_sinfo->bytes_may_use -= len; trace_btrfs_space_reservation(root->fs_info, "space_info", - data_sinfo->flags, bytes, 0); + data_sinfo->flags, len, 0); spin_unlock(&data_sinfo->lock); } +/* + * Called if we need to clear a data reservation for this inode + * Normally in a error case. + * + * This one will handle the per-indoe data rsv map for accurate reserved + * space framework. + */ +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) +{ + btrfs_free_reserved_data_space_noquota(inode, start, len); + btrfs_qgroup_free_data(inode, start, len); +} + static void force_metadata_allocation(struct btrfs_fs_info *info) { struct list_head *head = &info->space_info; @@ -4891,13 +5027,9 @@ static struct btrfs_block_rsv *get_block_rsv( { struct btrfs_block_rsv *block_rsv = NULL; - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) - block_rsv = trans->block_rsv; - - if (root == root->fs_info->csum_root && trans->adding_csums) - block_rsv = trans->block_rsv; - - if (root == root->fs_info->uuid_root) + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + (root == root->fs_info->csum_root && trans->adding_csums) || + (root == root->fs_info->uuid_root)) block_rsv = trans->block_rsv; if (!block_rsv) @@ -5340,7 +5472,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (root->fs_info->quota_enabled) { /* One for parent inode, two for dir entries */ num_bytes = 3 * root->nodesize; - ret = btrfs_qgroup_reserve(root, num_bytes); + ret = btrfs_qgroup_reserve_meta(root, num_bytes); if (ret) return ret; } else { @@ -5358,10 +5490,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (ret == -ENOSPC && use_global_rsv) ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); - if (ret) { - if (*qgroup_reserved) - btrfs_qgroup_free(root, *qgroup_reserved); - } + if (ret && *qgroup_reserved) + btrfs_qgroup_free_meta(root, *qgroup_reserved); return ret; } @@ -5522,15 +5652,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) spin_unlock(&BTRFS_I(inode)->lock); if (root->fs_info->quota_enabled) { - ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize); + ret = btrfs_qgroup_reserve_meta(root, + nr_extents * root->nodesize); if (ret) goto out_fail; } ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (unlikely(ret)) { - if (root->fs_info->quota_enabled) - btrfs_qgroup_free(root, nr_extents * root->nodesize); + btrfs_qgroup_free_meta(root, nr_extents * root->nodesize); goto out_fail; } @@ -5653,41 +5783,48 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) } /** - * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc + * btrfs_delalloc_reserve_space - reserve data and metadata space for + * delalloc * @inode: inode we're writing to - * @num_bytes: the number of bytes we want to allocate + * @start: start range we are writing to + * @len: how long the range we are writing to + * + * TODO: This function will finally replace old btrfs_delalloc_reserve_space() * * This will do the following things * - * o reserve space in the data space info for num_bytes - * o reserve space in the metadata space info based on number of outstanding + * o reserve space in data space info for num bytes + * and reserve precious corresponding qgroup space + * (Done in check_data_free_space) + * + * o reserve space for metadata space, based on the number of outstanding * extents and how much csums will be needed - * o add to the inodes ->delalloc_bytes + * also reserve metadata space in a per root over-reserve method. + * o add to the inodes->delalloc_bytes * o add it to the fs_info's delalloc inodes list. + * (Above 3 all done in delalloc_reserve_metadata) * - * This will return 0 for success and -ENOSPC if there is no space left. + * Return 0 for success + * Return <0 for error(-ENOSPC or -EQUOT) */ -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) { int ret; - ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes); - if (ret) - return ret; - - ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); - if (ret) { - btrfs_free_reserved_data_space(inode, num_bytes); + ret = btrfs_check_data_free_space(inode, start, len); + if (ret < 0) return ret; - } - - return 0; + ret = btrfs_delalloc_reserve_metadata(inode, len); + if (ret < 0) + btrfs_free_reserved_data_space(inode, start, len); + return ret; } /** * btrfs_delalloc_release_space - release data and metadata space for delalloc * @inode: inode we're releasing space for - * @num_bytes: the number of bytes we want to free up + * @start: start position of the space already reserved + * @len: the len of the space already reserved * * This must be matched with a call to btrfs_delalloc_reserve_space. This is * called in the case that we don't need the metadata AND data reservations @@ -5696,11 +5833,12 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) * This function will release the metadata space that was not used and will * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes * list if there are no delalloc bytes left. + * Also it will handle the qgroup reserved space. */ -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) +void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) { - btrfs_delalloc_release_metadata(inode, num_bytes); - btrfs_free_reserved_data_space(inode, num_bytes); + btrfs_delalloc_release_metadata(inode, len); + btrfs_free_reserved_data_space(inode, start, len); } static int update_block_group(struct btrfs_trans_handle *trans, @@ -6065,6 +6203,34 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, update_global_block_rsv(fs_info); } +/* + * Returns the free cluster for the given space info and sets empty_cluster to + * what it should be based on the mount options. + */ +static struct btrfs_free_cluster * +fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info, + u64 *empty_cluster) +{ + struct btrfs_free_cluster *ret = NULL; + bool ssd = btrfs_test_opt(root, SSD); + + *empty_cluster = 0; + if (btrfs_mixed_space_info(space_info)) + return ret; + + if (ssd) + *empty_cluster = 2 * 1024 * 1024; + if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + ret = &root->fs_info->meta_alloc_cluster; + if (!ssd) + *empty_cluster = 64 * 1024; + } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) { + ret = &root->fs_info->data_alloc_cluster; + } + + return ret; +} + static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, const bool return_free_space) { @@ -6072,7 +6238,10 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_block_group_cache *cache = NULL; struct btrfs_space_info *space_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_free_cluster *cluster = NULL; u64 len; + u64 total_unpinned = 0; + u64 empty_cluster = 0; bool readonly; while (start <= end) { @@ -6081,8 +6250,14 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, start >= cache->key.objectid + cache->key.offset) { if (cache) btrfs_put_block_group(cache); + total_unpinned = 0; cache = btrfs_lookup_block_group(fs_info, start); BUG_ON(!cache); /* Logic error */ + + cluster = fetch_cluster_info(root, + cache->space_info, + &empty_cluster); + empty_cluster <<= 1; } len = cache->key.objectid + cache->key.offset - start; @@ -6095,12 +6270,27 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, } start += len; + total_unpinned += len; space_info = cache->space_info; + /* + * If this space cluster has been marked as fragmented and we've + * unpinned enough in this block group to potentially allow a + * cluster to be created inside of it go ahead and clear the + * fragmented check. + */ + if (cluster && cluster->fragmented && + total_unpinned > empty_cluster) { + spin_lock(&cluster->lock); + cluster->fragmented = 0; + spin_unlock(&cluster->lock); + } + spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; space_info->bytes_pinned -= len; + space_info->max_extent_size = 0; percpu_counter_add(&space_info->total_bytes_pinned, -len); if (cache->ro) { space_info->bytes_readonly += len; @@ -6233,7 +6423,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int extent_slot = 0; int found_extent = 0; int num_to_del = 1; - int no_quota = node->no_quota; u32 item_size; u64 refs; u64 bytenr = node->bytenr; @@ -6242,9 +6431,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); - if (!info->quota_enabled || !is_fstree(root_objectid)) - no_quota = 1; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -6570,7 +6756,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, buf->start, buf->len, parent, root->root_key.objectid, btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL, 0); + BTRFS_DROP_DELAYED_REF, NULL); BUG_ON(ret); /* -ENOMEM */ } @@ -6618,7 +6804,7 @@ out: /* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int no_quota) + u64 owner, u64 offset) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -6641,13 +6827,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL, no_quota); + BTRFS_DROP_DELAYED_REF, NULL); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, - offset, BTRFS_DROP_DELAYED_REF, - NULL, no_quota); + offset, 0, + BTRFS_DROP_DELAYED_REF, NULL); } return ret; } @@ -6833,7 +7019,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, struct btrfs_block_group_cache *block_group = NULL; u64 search_start = 0; u64 max_extent_size = 0; - int empty_cluster = 2 * 1024 * 1024; + u64 empty_cluster = 0; struct btrfs_space_info *space_info; int loop = 0; int index = __get_raid_index(flags); @@ -6843,6 +7029,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, bool failed_alloc = false; bool use_cluster = true; bool have_caching_bg = false; + bool orig_have_caching_bg = false; + bool full_search = false; WARN_ON(num_bytes < root->sectorsize); ins->type = BTRFS_EXTENT_ITEM_KEY; @@ -6858,36 +7046,47 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, } /* - * If the space info is for both data and metadata it means we have a - * small filesystem and we can't use the clustering stuff. + * If our free space is heavily fragmented we may not be able to make + * big contiguous allocations, so instead of doing the expensive search + * for free space, simply return ENOSPC with our max_extent_size so we + * can go ahead and search for a more manageable chunk. + * + * If our max_extent_size is large enough for our allocation simply + * disable clustering since we will likely not be able to find enough + * space to create a cluster and induce latency trying. */ - if (btrfs_mixed_space_info(space_info)) - use_cluster = false; - - if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { - last_ptr = &root->fs_info->meta_alloc_cluster; - if (!btrfs_test_opt(root, SSD)) - empty_cluster = 64 * 1024; - } - - if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && - btrfs_test_opt(root, SSD)) { - last_ptr = &root->fs_info->data_alloc_cluster; + if (unlikely(space_info->max_extent_size)) { + spin_lock(&space_info->lock); + if (space_info->max_extent_size && + num_bytes > space_info->max_extent_size) { + ins->offset = space_info->max_extent_size; + spin_unlock(&space_info->lock); + return -ENOSPC; + } else if (space_info->max_extent_size) { + use_cluster = false; + } + spin_unlock(&space_info->lock); } + last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster); if (last_ptr) { spin_lock(&last_ptr->lock); if (last_ptr->block_group) hint_byte = last_ptr->window_start; + if (last_ptr->fragmented) { + /* + * We still set window_start so we can keep track of the + * last place we found an allocation to try and save + * some time. + */ + hint_byte = last_ptr->window_start; + use_cluster = false; + } spin_unlock(&last_ptr->lock); } search_start = max(search_start, first_logical_byte(root, 0)); search_start = max(search_start, hint_byte); - - if (!last_ptr) - empty_cluster = 0; - if (search_start == hint_byte) { block_group = btrfs_lookup_block_group(root->fs_info, search_start); @@ -6922,6 +7121,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, } search: have_caching_bg = false; + if (index == 0 || index == __get_raid_index(flags)) + full_search = true; down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[index], list) { @@ -6955,6 +7156,7 @@ search: have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { + have_caching_bg = true; ret = cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; @@ -6969,7 +7171,7 @@ have_block_group: * Ok we want to try and use the cluster allocator, so * lets look there */ - if (last_ptr) { + if (last_ptr && use_cluster) { struct btrfs_block_group_cache *used_block_group; unsigned long aligned_cluster; /* @@ -7095,6 +7297,16 @@ refill_cluster: } unclustered_alloc: + /* + * We are doing an unclustered alloc, set the fragmented flag so + * we don't bother trying to setup a cluster again until we get + * more space. + */ + if (unlikely(last_ptr)) { + spin_lock(&last_ptr->lock); + last_ptr->fragmented = 1; + spin_unlock(&last_ptr->lock); + } spin_lock(&block_group->free_space_ctl->tree_lock); if (cached && block_group->free_space_ctl->free_space < @@ -7127,8 +7339,6 @@ unclustered_alloc: failed_alloc = true; goto have_block_group; } else if (!offset) { - if (!cached) - have_caching_bg = true; goto loop; } checks: @@ -7169,6 +7379,10 @@ loop: } up_read(&space_info->groups_sem); + if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg + && !orig_have_caching_bg) + orig_have_caching_bg = true; + if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) goto search; @@ -7185,7 +7399,20 @@ loop: */ if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { index = 0; - loop++; + if (loop == LOOP_CACHING_NOWAIT) { + /* + * We want to skip the LOOP_CACHING_WAIT step if we + * don't have any unached bgs and we've alrelady done a + * full search through. + */ + if (orig_have_caching_bg || !full_search) + loop = LOOP_CACHING_WAIT; + else + loop = LOOP_ALLOC_CHUNK; + } else { + loop++; + } + if (loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; int exist = 0; @@ -7203,6 +7430,15 @@ loop: ret = do_chunk_alloc(trans, root, flags, CHUNK_ALLOC_FORCE); + + /* + * If we can't allocate a new chunk we've already looped + * through at least once, move on to the NO_EMPTY_SIZE + * case. + */ + if (ret == -ENOSPC) + loop = LOOP_NO_EMPTY_SIZE; + /* * Do not bail out on ENOSPC since we * can do more things. @@ -7219,6 +7455,15 @@ loop: } if (loop == LOOP_NO_EMPTY_SIZE) { + /* + * Don't loop again if we already have no empty_size and + * no empty_cluster. + */ + if (empty_size == 0 && + empty_cluster == 0) { + ret = -ENOSPC; + goto out; + } empty_size = 0; empty_cluster = 0; } @@ -7227,11 +7472,20 @@ loop: } else if (!ins->objectid) { ret = -ENOSPC; } else if (ins->objectid) { + if (!use_cluster && last_ptr) { + spin_lock(&last_ptr->lock); + last_ptr->window_start = ins->objectid; + spin_unlock(&last_ptr->lock); + } ret = 0; } out: - if (ret == -ENOSPC) + if (ret == -ENOSPC) { + spin_lock(&space_info->lock); + space_info->max_extent_size = max_extent_size; + spin_unlock(&space_info->lock); ins->offset = max_extent_size; + } return ret; } @@ -7280,7 +7534,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc) { - bool final_tried = false; + bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; @@ -7429,8 +7683,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins, - int no_quota) + int level, struct btrfs_key *ins) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -7511,7 +7764,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, - u64 offset, struct btrfs_key *ins) + u64 offset, u64 ram_bytes, + struct btrfs_key *ins) { int ret; @@ -7520,7 +7774,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, ins->offset, 0, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT, NULL, 0); + ram_bytes, BTRFS_ADD_DELAYED_EXTENT, + NULL); return ret; } @@ -7734,7 +7989,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, ins.objectid, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op, 0); + extent_op); if (ret) goto out_free_delayed; } @@ -8275,14 +8530,15 @@ skip: ret = account_shared_subtree(trans, root, next, generation, level - 1); if (ret) { - printk_ratelimited(KERN_ERR "BTRFS: %s Error " + btrfs_err_rl(root->fs_info, + "Error " "%d accounting shared subtree. Quota " - "is out of sync, rescan required.\n", - root->fs_info->sb->s_id, ret); + "is out of sync, rescan required.", + ret); } } ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, - root->root_key.objectid, level - 1, 0, 0); + root->root_key.objectid, level - 1, 0); BUG_ON(ret); /* -ENOMEM */ } btrfs_tree_unlock(next); @@ -8367,10 +8623,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ ret = account_leaf_items(trans, root, eb); if (ret) { - printk_ratelimited(KERN_ERR "BTRFS: %s Error " + btrfs_err_rl(root->fs_info, + "error " "%d accounting leaf items. Quota " - "is out of sync, rescan required.\n", - root->fs_info->sb->s_id, ret); + "is out of sync, rescan required.", + ret); } } /* make block locked assertion in clean_tree_block happy */ @@ -8692,7 +8949,7 @@ out: if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); if (err && err != -EAGAIN) - btrfs_std_error(root->fs_info, err); + btrfs_std_error(root->fs_info, err, NULL); return err; } @@ -8880,7 +9137,7 @@ again: * back off and let this transaction commit */ mutex_lock(&root->fs_info->ro_block_group_mutex); - if (trans->transaction->dirty_bg_run) { + if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { u64 transid = trans->transid; mutex_unlock(&root->fs_info->ro_block_group_mutex); @@ -9630,6 +9887,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, free_excluded_extents(root, cache); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(root, cache)) { + u64 new_bytes_used = size - bytes_used; + + bytes_used += new_bytes_used >> 1; + fragment_free_space(root, cache); + } +#endif /* * Call to ensure the corresponding space_info object is created and * assigned to our block group, but don't update its counters just yet. @@ -10370,8 +10635,7 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root) { percpu_counter_dec(&root->subv_writers->counter); /* - * Make sure counter is updated before we wake up - * waiters. + * Make sure counter is updated before we wake up waiters. */ smp_mb(); if (waitqueue_active(&root->subv_writers->wait)) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3915c94..33a01ea 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -96,8 +96,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, inode = tree->mapping->host; isize = i_size_read(inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - printk_ratelimited(KERN_DEBUG - "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n", + btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, + "%s: ino %llu isize %llu odd range [%llu,%llu]", caller, btrfs_ino(inode), isize, start, end); } } @@ -131,6 +131,25 @@ struct extent_page_data { unsigned int sync_io:1; }; +static void add_extent_changeset(struct extent_state *state, unsigned bits, + struct extent_changeset *changeset, + int set) +{ + int ret; + + if (!changeset) + return; + if (set && (state->state & bits) == bits) + return; + if (!set && (state->state & bits) == 0) + return; + changeset->bytes_changed += state->end - state->start + 1; + ret = ulist_add(changeset->range_changed, state->start, state->end, + GFP_ATOMIC); + /* ENOMEM */ + BUG_ON(ret < 0); +} + static noinline void flush_write_bio(void *data); static inline struct btrfs_fs_info * tree_fs_info(struct extent_io_tree *tree) @@ -410,7 +429,8 @@ static void clear_state_cb(struct extent_io_tree *tree, } static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, unsigned *bits); + struct extent_state *state, unsigned *bits, + struct extent_changeset *changeset); /* * insert an extent_state struct into the tree. 'bits' are set on the @@ -426,7 +446,7 @@ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, struct rb_node ***p, struct rb_node **parent, - unsigned *bits) + unsigned *bits, struct extent_changeset *changeset) { struct rb_node *node; @@ -436,7 +456,7 @@ static int insert_state(struct extent_io_tree *tree, state->start = start; state->end = end; - set_state_bits(tree, state, bits); + set_state_bits(tree, state, bits, changeset); node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); if (node) { @@ -511,7 +531,8 @@ static struct extent_state *next_state(struct extent_state *state) */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - unsigned *bits, int wake) + unsigned *bits, int wake, + struct extent_changeset *changeset) { struct extent_state *next; unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; @@ -522,6 +543,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, tree->dirty_bytes -= range; } clear_state_cb(tree, state, bits); + add_extent_changeset(state, bits_to_clear, changeset, 0); state->state &= ~bits_to_clear; if (wake) wake_up(&state->wq); @@ -569,10 +591,10 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) * * This takes the tree lock, and returns 0 on success and < 0 on error. */ -int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int wake, int delete, - struct extent_state **cached_state, - gfp_t mask) +static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int wake, int delete, + struct extent_state **cached_state, + gfp_t mask, struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *cached; @@ -671,7 +693,8 @@ hit_next: if (err) goto out; if (state->end <= end) { - state = clear_state_bit(tree, state, &bits, wake); + state = clear_state_bit(tree, state, &bits, wake, + changeset); goto next; } goto search_again; @@ -692,13 +715,13 @@ hit_next: if (wake) wake_up(&state->wq); - clear_state_bit(tree, prealloc, &bits, wake); + clear_state_bit(tree, prealloc, &bits, wake, changeset); prealloc = NULL; goto out; } - state = clear_state_bit(tree, state, &bits, wake); + state = clear_state_bit(tree, state, &bits, wake, changeset); next: if (last_end == (u64)-1) goto out; @@ -789,7 +812,7 @@ out: static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - unsigned *bits) + unsigned *bits, struct extent_changeset *changeset) { unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; @@ -798,6 +821,7 @@ static void set_state_bits(struct extent_io_tree *tree, u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } + add_extent_changeset(state, bits_to_set, changeset, 1); state->state |= bits_to_set; } @@ -835,7 +859,7 @@ static int __must_check __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, unsigned exclusive_bits, u64 *failed_start, struct extent_state **cached_state, - gfp_t mask) + gfp_t mask, struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -873,7 +897,7 @@ again: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits); + &p, &parent, &bits, changeset); if (err) extent_io_tree_panic(tree, err); @@ -899,7 +923,7 @@ hit_next: goto out; } - set_state_bits(tree, state, &bits); + set_state_bits(tree, state, &bits, changeset); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -945,7 +969,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, &bits); + set_state_bits(tree, state, &bits, changeset); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -980,7 +1004,7 @@ hit_next: * the later extent. */ err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits); + NULL, NULL, &bits, changeset); if (err) extent_io_tree_panic(tree, err); @@ -1008,7 +1032,7 @@ hit_next: if (err) extent_io_tree_panic(tree, err); - set_state_bits(tree, prealloc, &bits); + set_state_bits(tree, prealloc, &bits, changeset); cache_state(prealloc, cached_state); merge_state(tree, prealloc); prealloc = NULL; @@ -1038,7 +1062,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { return __set_extent_bit(tree, start, end, bits, 0, failed_start, - cached_state, mask); + cached_state, mask, NULL); } @@ -1111,7 +1135,7 @@ again: goto out; } err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits); + &p, &parent, &bits, NULL); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); @@ -1130,9 +1154,9 @@ hit_next: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - set_state_bits(tree, state, &bits); + set_state_bits(tree, state, &bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0); + state = clear_state_bit(tree, state, &clear_bits, 0, NULL); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1171,9 +1195,10 @@ hit_next: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, &bits); + set_state_bits(tree, state, &bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0); + state = clear_state_bit(tree, state, &clear_bits, 0, + NULL); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1208,7 +1233,7 @@ hit_next: * the later extent. */ err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits); + NULL, NULL, &bits, NULL); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); @@ -1233,9 +1258,9 @@ hit_next: if (err) extent_io_tree_panic(tree, err); - set_state_bits(tree, prealloc, &bits); + set_state_bits(tree, prealloc, &bits, NULL); cache_state(prealloc, cached_state); - clear_state_bit(tree, prealloc, &clear_bits, 0); + clear_state_bit(tree, prealloc, &clear_bits, 0, NULL); prealloc = NULL; goto out; } @@ -1274,6 +1299,30 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, NULL, mask); } +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask, + struct extent_changeset *changeset) +{ + /* + * We don't support EXTENT_LOCKED yet, as current changeset will + * record any bits changed, so for EXTENT_LOCKED case, it will + * either fail with -EEXIST or changeset will record the whole + * range. + */ + BUG_ON(bits & EXTENT_LOCKED); + + return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask, + changeset); +} + +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int wake, int delete, + struct extent_state **cached, gfp_t mask) +{ + return __clear_extent_bit(tree, start, end, bits, wake, delete, + cached, mask, NULL); +} + int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask) { @@ -1285,6 +1334,20 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); } +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask, + struct extent_changeset *changeset) +{ + /* + * Don't support EXTENT_LOCKED case, same reason as + * set_record_extent_bits(). + */ + BUG_ON(bits & EXTENT_LOCKED); + + return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask, + changeset); +} + int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { @@ -1343,7 +1406,7 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, while (1) { err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, EXTENT_LOCKED, &failed_start, - cached_state, GFP_NOFS); + cached_state, GFP_NOFS, NULL); if (err == -EEXIST) { wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); start = failed_start; @@ -1365,7 +1428,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, GFP_NOFS); + &failed_start, NULL, GFP_NOFS, NULL); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, @@ -2078,8 +2141,8 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, return -EIO; } - printk_ratelimited_in_rcu(KERN_INFO - "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n", + btrfs_info_rl_in_rcu(fs_info, + "read error corrected: ino %llu off %llu (dev %s sector %llu)", btrfs_ino(inode), start, rcu_str_deref(dev->name), sector); bio_put(bio); @@ -3070,8 +3133,12 @@ static int __do_readpage(struct extent_io_tree *tree, set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - unlock_extent_cached(tree, cur, cur + iosize - 1, - &cached, GFP_NOFS); + if (parent_locked) + free_extent_state(cached); + else + unlock_extent_cached(tree, cur, + cur + iosize - 1, + &cached, GFP_NOFS); cur = cur + iosize; pg_offset += iosize; continue; @@ -5566,13 +5633,15 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " - "len %lu dst len %lu\n", src_offset, len, dst->len); + btrfs_err(dst->fs_info, + "memmove bogus src_offset %lu move " + "len %lu dst len %lu", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " - "len %lu dst len %lu\n", dst_offset, len, dst->len); + btrfs_err(dst->fs_info, + "memmove bogus dst_offset %lu move " + "len %lu dst len %lu", dst_offset, len, dst->len); BUG_ON(1); } @@ -5612,13 +5681,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " - "len %lu len %lu\n", src_offset, len, dst->len); + btrfs_err(dst->fs_info, "memmove bogus src_offset %lu move " + "len %lu len %lu", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " - "len %lu len %lu\n", dst_offset, len, dst->len); + btrfs_err(dst->fs_info, "memmove bogus dst_offset %lu move " + "len %lu len %lu", dst_offset, len, dst->len); BUG_ON(1); } if (dst_offset < src_offset) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c668f36..f4c1ae1 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -2,6 +2,7 @@ #define __EXTENTIO__ #include <linux/rbtree.h> +#include "ulist.h" /* bits for the extent state */ #define EXTENT_DIRTY (1U << 0) @@ -18,6 +19,7 @@ #define EXTENT_NEED_WAIT (1U << 13) #define EXTENT_DAMAGED (1U << 14) #define EXTENT_NORESERVE (1U << 15) +#define EXTENT_QGROUP_RESERVED (1U << 16) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) @@ -161,6 +163,17 @@ struct extent_buffer { #endif }; +/* + * Structure to record how many bytes and which ranges are set/cleared + */ +struct extent_changeset { + /* How many bytes are set/cleared in this operation */ + u64 bytes_changed; + + /* Changed ranges */ + struct ulist *range_changed; +}; + static inline void extent_set_compress_type(unsigned long *bio_flags, int compress_type) { @@ -210,11 +223,17 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *cached_state); int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask); +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask, + struct extent_changeset *changeset); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, struct extent_state **cached, gfp_t mask); int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask); +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask, + struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8c6f247..6bd5ce9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -847,7 +847,7 @@ next_slot: disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, - start - extent_offset, 1); + start - extent_offset); BUG_ON(ret); /* -ENOMEM */ } key.offset = start; @@ -925,7 +925,7 @@ delete_extent_item: disk_bytenr, num_bytes, 0, root->root_key.objectid, key.objectid, key.offset - - extent_offset, 0); + extent_offset); BUG_ON(ret); /* -ENOMEM */ inode_sub_bytes(inode, extent_end - key.offset); @@ -1204,7 +1204,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 1); + ino, orig_offset); BUG_ON(ret); /* -ENOMEM */ if (split == start) { @@ -1231,7 +1231,7 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset); BUG_ON(ret); /* -ENOMEM */ } other_start = 0; @@ -1248,7 +1248,7 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset); BUG_ON(ret); /* -ENOMEM */ } if (del_nr == 0) { @@ -1469,7 +1469,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, u64 release_bytes = 0; u64 lockstart; u64 lockend; - unsigned long first_index; size_t num_written = 0; int nrptrs; int ret = 0; @@ -1485,8 +1484,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, if (!pages) return -ENOMEM; - first_index = pos >> PAGE_CACHE_SHIFT; - while (iov_iter_count(i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); size_t write_bytes = min(iov_iter_count(i), @@ -1510,12 +1507,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } reserve_bytes = num_pages << PAGE_CACHE_SHIFT; - ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes); - if (ret == -ENOSPC && - (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | - BTRFS_INODE_PREALLOC))) { + + if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) { ret = check_can_nocow(inode, pos, &write_bytes); + if (ret < 0) + break; if (ret > 0) { + /* + * For nodata cow case, no need to reserve + * data space. + */ only_release_metadata = true; /* * our prealloc extent may be smaller than @@ -1524,20 +1526,19 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_CACHE_SIZE); reserve_bytes = num_pages << PAGE_CACHE_SHIFT; - ret = 0; - } else { - ret = -ENOSPC; + goto reserve_metadata; } } - - if (ret) + ret = btrfs_check_data_free_space(inode, pos, write_bytes); + if (ret < 0) break; +reserve_metadata: ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); if (ret) { if (!only_release_metadata) - btrfs_free_reserved_data_space(inode, - reserve_bytes); + btrfs_free_reserved_data_space(inode, pos, + write_bytes); else btrfs_end_write_no_snapshoting(root); break; @@ -1603,12 +1604,17 @@ again: BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); } - if (only_release_metadata) + if (only_release_metadata) { btrfs_delalloc_release_metadata(inode, release_bytes); - else - btrfs_delalloc_release_space(inode, + } else { + u64 __pos; + + __pos = round_down(pos, root->sectorsize) + + (dirty_pages << PAGE_CACHE_SHIFT); + btrfs_delalloc_release_space(inode, __pos, release_bytes); + } } release_bytes = dirty_pages << PAGE_CACHE_SHIFT; @@ -1660,7 +1666,7 @@ again: btrfs_end_write_no_snapshoting(root); btrfs_delalloc_release_metadata(inode, release_bytes); } else { - btrfs_delalloc_release_space(inode, release_bytes); + btrfs_delalloc_release_space(inode, pos, release_bytes); } } @@ -2266,7 +2272,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) u64 drop_end; int ret = 0; int err = 0; - int rsv_count; + unsigned int rsv_count; bool same_page; bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); u64 ino_size; @@ -2488,6 +2494,19 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) trans->block_rsv = &root->fs_info->trans_block_rsv; /* + * If we are using the NO_HOLES feature we might have had already an + * hole that overlaps a part of the region [lockstart, lockend] and + * ends at (or beyond) lockend. Since we have no file extent items to + * represent holes, drop_end can be less than lockend and so we must + * make sure we have an extent map representing the existing hole (the + * call to __btrfs_drop_extents() might have dropped the existing extent + * map representing the existing hole), otherwise the fast fsync path + * will not record the existence of the hole region + * [existing_hole_start, lockend]. + */ + if (drop_end <= lockend) + drop_end = lockend + 1; + /* * Don't insert file hole extent item if it's for a range beyond eof * (because it's useless) or if it represents a 0 bytes range (when * cur_offset == drop_end). @@ -2541,17 +2560,61 @@ out_only_mutex: return err; } +/* Helper structure to record which range is already reserved */ +struct falloc_range { + struct list_head list; + u64 start; + u64 len; +}; + +/* + * Helper function to add falloc range + * + * Caller should have locked the larger range of extent containing + * [start, len) + */ +static int add_falloc_range(struct list_head *head, u64 start, u64 len) +{ + struct falloc_range *prev = NULL; + struct falloc_range *range = NULL; + + if (list_empty(head)) + goto insert; + + /* + * As fallocate iterate by bytenr order, we only need to check + * the last range. + */ + prev = list_entry(head->prev, struct falloc_range, list); + if (prev->start + prev->len == start) { + prev->len += len; + return 0; + } +insert: + range = kmalloc(sizeof(*range), GFP_NOFS); + if (!range) + return -ENOMEM; + range->start = start; + range->len = len; + list_add_tail(&range->list, head); + return 0; +} + static long btrfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; + struct falloc_range *range; + struct falloc_range *tmp; + struct list_head reserve_list; u64 cur_offset; u64 last_byte; u64 alloc_start; u64 alloc_end; u64 alloc_hint = 0; u64 locked_end; + u64 actual_end = 0; struct extent_map *em; int blocksize = BTRFS_I(inode)->root->sectorsize; int ret; @@ -2567,11 +2630,12 @@ static long btrfs_fallocate(struct file *file, int mode, return btrfs_punch_hole(inode, offset, len); /* - * Make sure we have enough space before we do the - * allocation. + * Only trigger disk allocation, don't trigger qgroup reserve + * + * For qgroup space, it will be checked later. */ - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start); - if (ret) + ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start); + if (ret < 0) return ret; mutex_lock(&inode->i_mutex); @@ -2579,6 +2643,13 @@ static long btrfs_fallocate(struct file *file, int mode, if (ret) goto out; + /* + * TODO: Move these two operations after we have checked + * accurate reserved space, or fallocate can still fail but + * with page truncated or size expanded. + * + * But that's a minor problem and won't do much harm BTW. + */ if (alloc_start > inode->i_size) { ret = btrfs_cont_expand(inode, i_size_read(inode), alloc_start); @@ -2637,10 +2708,10 @@ static long btrfs_fallocate(struct file *file, int mode, } } + /* First, check if we exceed the qgroup limit */ + INIT_LIST_HEAD(&reserve_list); cur_offset = alloc_start; while (1) { - u64 actual_end; - em = btrfs_get_extent(inode, NULL, 0, cur_offset, alloc_end - cur_offset, 0); if (IS_ERR_OR_NULL(em)) { @@ -2653,57 +2724,82 @@ static long btrfs_fallocate(struct file *file, int mode, last_byte = min(extent_map_end(em), alloc_end); actual_end = min_t(u64, extent_map_end(em), offset + len); last_byte = ALIGN(last_byte, blocksize); - if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = btrfs_prealloc_file_range(inode, mode, cur_offset, - last_byte - cur_offset, - 1 << inode->i_blkbits, - offset + len, - &alloc_hint); - } else if (actual_end > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - - /* - * We didn't need to allocate any more space, but we - * still extended the size of the file so we need to - * update i_size and the inode item. - */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - } else { - inode->i_ctime = CURRENT_TIME; - i_size_write(inode, actual_end); - btrfs_ordered_update_i_size(inode, actual_end, - NULL); - ret = btrfs_update_inode(trans, root, inode); - if (ret) - btrfs_end_transaction(trans, root); - else - ret = btrfs_end_transaction(trans, - root); + ret = add_falloc_range(&reserve_list, cur_offset, + last_byte - cur_offset); + if (ret < 0) { + free_extent_map(em); + break; } + ret = btrfs_qgroup_reserve_data(inode, cur_offset, + last_byte - cur_offset); + if (ret < 0) + break; } free_extent_map(em); - if (ret < 0) - break; - cur_offset = last_byte; - if (cur_offset >= alloc_end) { - ret = 0; + if (cur_offset >= alloc_end) break; + } + + /* + * If ret is still 0, means we're OK to fallocate. + * Or just cleanup the list and exit. + */ + list_for_each_entry_safe(range, tmp, &reserve_list, list) { + if (!ret) + ret = btrfs_prealloc_file_range(inode, mode, + range->start, + range->len, 1 << inode->i_blkbits, + offset + len, &alloc_hint); + list_del(&range->list); + kfree(range); + } + if (ret < 0) + goto out_unlock; + + if (actual_end > inode->i_size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* + * We didn't need to allocate any more space, but we + * still extended the size of the file so we need to + * update i_size and the inode item. + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + } else { + inode->i_ctime = CURRENT_TIME; + i_size_write(inode, actual_end); + btrfs_ordered_update_i_size(inode, actual_end, NULL); + ret = btrfs_update_inode(trans, root, inode); + if (ret) + btrfs_end_transaction(trans, root); + else + ret = btrfs_end_transaction(trans, root); } } +out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_NOFS); out: + /* + * As we waited the extent range, the data_rsv_map must be empty + * in the range, as written data range will be released from it. + * And for prealloacted extent, it will also be released when + * its metadata is written. + * So this is completely used as cleanup. + */ + btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start); mutex_unlock(&inode->i_mutex); /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); + btrfs_free_reserved_data_space(inode, alloc_start, + alloc_end - alloc_start); return ret; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index abe3a66..0948d34 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -450,9 +450,9 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) gen = io_ctl->cur; if (le64_to_cpu(*gen) != generation) { - printk_ratelimited(KERN_ERR "BTRFS: space cache generation " - "(%Lu) does not match inode (%Lu)\n", *gen, - generation); + btrfs_err_rl(io_ctl->root->fs_info, + "space cache generation (%llu) does not match inode (%llu)", + *gen, generation); io_ctl_unmap_page(io_ctl); return -EIO; } @@ -506,8 +506,8 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) PAGE_CACHE_SIZE - offset); btrfs_csum_final(crc, (char *)&crc); if (val != crc) { - printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free " - "space cache\n"); + btrfs_err_rl(io_ctl->root->fs_info, + "csum mismatch on free space cache"); io_ctl_unmap_page(io_ctl); return -EIO; } @@ -1215,7 +1215,7 @@ out: * @offset - the offset for the key we'll insert * * This function writes out a free space cache struct to disk for quick recovery - * on mount. This will return 0 if it was successfull in writing the cache out, + * on mount. This will return 0 if it was successful in writing the cache out, * or an errno if it was not. */ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, @@ -1730,7 +1730,7 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, */ static int search_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info, u64 *offset, - u64 *bytes) + u64 *bytes, bool for_alloc) { unsigned long found_bits = 0; unsigned long max_bits = 0; @@ -1738,11 +1738,26 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, unsigned long next_zero; unsigned long extent_bits; + /* + * Skip searching the bitmap if we don't have a contiguous section that + * is large enough for this allocation. + */ + if (for_alloc && + bitmap_info->max_extent_size && + bitmap_info->max_extent_size < *bytes) { + *bytes = bitmap_info->max_extent_size; + return -1; + } + i = offset_to_bit(bitmap_info->offset, ctl->unit, max_t(u64, *offset, bitmap_info->offset)); bits = bytes_to_bits(*bytes, ctl->unit); for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { + if (for_alloc && bits == 1) { + found_bits = 1; + break; + } next_zero = find_next_zero_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); extent_bits = next_zero - i; @@ -1762,6 +1777,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, } *bytes = (u64)(max_bits) * ctl->unit; + bitmap_info->max_extent_size = *bytes; return -1; } @@ -1813,7 +1829,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, if (entry->bitmap) { u64 size = *bytes; - ret = search_bitmap(ctl, entry, &tmp, &size); + ret = search_bitmap(ctl, entry, &tmp, &size, true); if (!ret) { *offset = tmp; *bytes = size; @@ -1874,7 +1890,8 @@ again: search_start = *offset; search_bytes = ctl->unit; search_bytes = min(search_bytes, end - search_start + 1); - ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes); + ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, + false); if (ret < 0 || search_start != *offset) return -EINVAL; @@ -1919,7 +1936,7 @@ again: search_start = *offset; search_bytes = ctl->unit; ret = search_bitmap(ctl, bitmap_info, &search_start, - &search_bytes); + &search_bytes, false); if (ret < 0 || search_start != *offset) return -EAGAIN; @@ -1943,6 +1960,12 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, bitmap_set_bits(ctl, info, offset, bytes_to_set); + /* + * We set some bytes, we have no idea what the max extent size is + * anymore. + */ + info->max_extent_size = 0; + return bytes_to_set; } @@ -1951,12 +1974,19 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { struct btrfs_block_group_cache *block_group = ctl->private; + bool forced = false; + +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(block_group->fs_info->extent_root, + block_group)) + forced = true; +#endif /* * If we are below the extents threshold then we can add this as an * extent, and don't have to deal with the bitmap */ - if (ctl->free_extents < ctl->extents_thresh) { + if (!forced && ctl->free_extents < ctl->extents_thresh) { /* * If this block group has some small extents we don't want to * use up all of our free slots in the cache with them, we want @@ -2661,7 +2691,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, search_start = min_start; search_bytes = bytes; - err = search_bitmap(ctl, entry, &search_start, &search_bytes); + err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); if (err) { if (search_bytes > *max_extent_size) *max_extent_size = search_bytes; @@ -2775,6 +2805,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, unsigned long want_bits; unsigned long min_bits; unsigned long found_bits; + unsigned long max_bits = 0; unsigned long start = 0; unsigned long total_found = 0; int ret; @@ -2784,6 +2815,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, want_bits = bytes_to_bits(bytes, ctl->unit); min_bits = bytes_to_bits(min_bytes, ctl->unit); + /* + * Don't bother looking for a cluster in this bitmap if it's heavily + * fragmented. + */ + if (entry->max_extent_size && + entry->max_extent_size < cont1_bytes) + return -ENOSPC; again: found_bits = 0; for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { @@ -2791,13 +2829,19 @@ again: BITS_PER_BITMAP, i); if (next_zero - i >= min_bits) { found_bits = next_zero - i; + if (found_bits > max_bits) + max_bits = found_bits; break; } + if (next_zero - i > max_bits) + max_bits = next_zero - i; i = next_zero; } - if (!found_bits) + if (!found_bits) { + entry->max_extent_size = (u64)max_bits * ctl->unit; return -ENOSPC; + } if (!total_found) { start = i; @@ -3056,6 +3100,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) spin_lock_init(&cluster->refill_lock); cluster->root = RB_ROOT; cluster->max_size = 0; + cluster->fragmented = false; INIT_LIST_HEAD(&cluster->block_group_list); cluster->block_group = NULL; } @@ -3223,7 +3268,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, } bytes = minlen; - ret2 = search_bitmap(ctl, entry, &start, &bytes); + ret2 = search_bitmap(ctl, entry, &start, &bytes, false); if (ret2 || start >= end) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); @@ -3376,7 +3421,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) u64 count = 1; int ret; - ret = search_bitmap(ctl, entry, &offset, &count); + ret = search_bitmap(ctl, entry, &offset, &count, true); /* Logic error; Should be empty if it can't find anything */ ASSERT(!ret); @@ -3532,6 +3577,7 @@ again: spin_lock(&ctl->tree_lock); info->offset = offset; info->bytes = bytes; + info->max_extent_size = 0; ret = link_free_space(ctl, info); spin_unlock(&ctl->tree_lock); if (ret) @@ -3559,6 +3605,7 @@ again: } bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes -= bytes_added; offset += bytes_added; spin_unlock(&ctl->tree_lock); @@ -3602,7 +3649,7 @@ have_info: bit_off = offset; bit_bytes = ctl->unit; - ret = search_bitmap(ctl, info, &bit_off, &bit_bytes); + ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false); if (!ret) { if (bit_off == offset) { ret = 1; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index a16a029..f251865 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -23,6 +23,7 @@ struct btrfs_free_space { struct rb_node offset_index; u64 offset; u64 bytes; + u64 max_extent_size; unsigned long *bitmap; struct list_head list; }; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 265e03c..be4d22a 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, */ if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref)) { - btrfs_std_error(root->fs_info, -ENOENT); + btrfs_std_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; goto out; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index d4a582a..767a605 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -488,17 +488,17 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_CACHE_SIZE; - ret = btrfs_delalloc_reserve_space(inode, prealloc); + ret = btrfs_delalloc_reserve_space(inode, 0, prealloc); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_space(inode, prealloc); + btrfs_delalloc_release_space(inode, 0, prealloc); goto out_put; } - btrfs_free_reserved_data_space(inode, prealloc); + btrfs_free_reserved_data_space(inode, 0, prealloc); ret = btrfs_write_out_ino_cache(root, trans, path, inode); out_put: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 611b66d..4439fbb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -310,6 +310,13 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); out: + /* + * Don't forget to free the reserved space, as for inlined extent + * it won't count as data extent, free them directly here. + * And at reserve time, it's always aligned to page size, so + * just free one page here. + */ + btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE); btrfs_free_path(path); btrfs_end_transaction(trans, root); return ret; @@ -1096,6 +1103,9 @@ static noinline void async_cow_submit(struct btrfs_work *work) nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT; + /* + * atomic_sub_return implies a barrier for waitqueue_active + */ if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) < 5 * 1024 * 1024 && waitqueue_active(&root->fs_info->async_submit_wait)) @@ -1766,7 +1776,8 @@ static void btrfs_clear_bit_hook(struct inode *inode, if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && do_list && !(state->state & EXTENT_NORESERVE)) - btrfs_free_reserved_data_space(inode, len); + btrfs_free_reserved_data_space_noquota(inode, + state->start, len); __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, root->fs_info->delalloc_batch); @@ -1861,15 +1872,15 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; + enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; int ret = 0; int skip_sum; - int metadata = 0; int async = !atomic_read(&BTRFS_I(inode)->sync_writers); skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; if (btrfs_is_free_space_inode(inode)) - metadata = 2; + metadata = BTRFS_WQ_ENDIO_FREE_SPACE; if (!(rw & REQ_WRITE)) { ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); @@ -1989,7 +2000,8 @@ again: goto again; } - ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + ret = btrfs_delalloc_reserve_space(inode, page_start, + PAGE_CACHE_SIZE); if (ret) { mapping_set_error(page->mapping, ret); end_extent_writepage(page, ret, page_start, page_end); @@ -2115,7 +2127,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_alloc_reserved_file_extent(trans, root, root->root_key.objectid, - btrfs_ino(inode), file_pos, &ins); + btrfs_ino(inode), file_pos, + ram_bytes, &ins); + /* + * Release the reserved range from inode dirty range map, as it is + * already moved into delayed_ref_head + */ + btrfs_qgroup_release_data(inode, file_pos, ram_bytes); out: btrfs_free_path(path); @@ -2573,7 +2591,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, new->bytenr, new->disk_len, 0, backref->root_id, backref->inum, - new->file_pos, 0); /* start - extent_offset */ + new->file_pos); /* start - extent_offset */ if (ret) { btrfs_abort_transaction(trans, root, ret); goto out_free_path; @@ -2599,7 +2617,6 @@ static void free_sa_defrag_extent(struct new_sa_defrag_extent *new) return; list_for_each_entry_safe(old, tmp, &new->head, list) { - list_del(&old->list); kfree(old); } kfree(new); @@ -2824,6 +2841,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ + + /* + * For mwrite(mmap + memset to write) case, we still reserve + * space for NOCOW range. + * As NOCOW won't cause a new delayed ref, just free the space + */ + btrfs_qgroup_free_data(inode, ordered_extent->file_offset, + ordered_extent->len); btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (nolock) trans = btrfs_join_transaction_nolock(root); @@ -3018,8 +3043,6 @@ static int __readpage_endio_check(struct inode *inode, char *kaddr; u32 csum_expected; u32 csum = ~(u32)0; - static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); csum_expected = *(((u32 *)io_bio->csum) + icsum); @@ -3032,9 +3055,8 @@ static int __readpage_endio_check(struct inode *inode, kunmap_atomic(kaddr); return 0; zeroit: - if (__ratelimit(&_rs)) - btrfs_warn(BTRFS_I(inode)->root->fs_info, - "csum failed ino %llu off %llu csum %u expected csum %u", + btrfs_warn_rl(BTRFS_I(inode)->root->fs_info, + "csum failed ino %llu off %llu csum %u expected csum %u", btrfs_ino(inode), start, csum, csum_expected); memset(kaddr + pgoff, 1, len); flush_dcache_page(page); @@ -4217,6 +4239,47 @@ static int truncate_space_check(struct btrfs_trans_handle *trans, } +static int truncate_inline_extent(struct inode *inode, + struct btrfs_path *path, + struct btrfs_key *found_key, + const u64 item_end, + const u64 new_size) +{ + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_file_extent_item *fi; + u32 size = (u32)(new_size - found_key->offset); + struct btrfs_root *root = BTRFS_I(inode)->root; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) { + loff_t offset = new_size; + loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE); + + /* + * Zero out the remaining of the last page of our inline extent, + * instead of directly truncating our inline extent here - that + * would be much more complex (decompressing all the data, then + * compressing the truncated data, which might be bigger than + * the size of the inline extent, resize the extent, etc). + * We release the path because to get the page we might need to + * read the extent item from disk (data not in the page cache). + */ + btrfs_release_path(path); + return btrfs_truncate_page(inode, offset, page_end - offset, 0); + } + + btrfs_set_file_extent_ram_bytes(leaf, fi, size); + size = btrfs_file_extent_calc_inline_size(size); + btrfs_truncate_item(root, path, size, 1); + + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + inode_sub_bytes(inode, item_end + 1 - new_size); + + return 0; +} + /* * this can truncate away extent items, csum items and directory items. * It starts at a high offset and removes keys until it can't find @@ -4411,27 +4474,40 @@ search_again: * special encodings */ if (!del_item && - btrfs_file_extent_compression(leaf, fi) == 0 && btrfs_file_extent_encryption(leaf, fi) == 0 && btrfs_file_extent_other_encoding(leaf, fi) == 0) { - u32 size = new_size - found_key.offset; - - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) - inode_sub_bytes(inode, item_end + 1 - - new_size); /* - * update the ram bytes to properly reflect - * the new size of our item + * Need to release path in order to truncate a + * compressed extent. So delete any accumulated + * extent items so far. */ - btrfs_set_file_extent_ram_bytes(leaf, fi, size); - size = - btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(root, path, size, 1); + if (btrfs_file_extent_compression(leaf, fi) != + BTRFS_COMPRESS_NONE && pending_del_nr) { + err = btrfs_del_items(trans, root, path, + pending_del_slot, + pending_del_nr); + if (err) { + btrfs_abort_transaction(trans, + root, + err); + goto error; + } + pending_del_nr = 0; + } + + err = truncate_inline_extent(inode, path, + &found_key, + item_end, + new_size); + if (err) { + btrfs_abort_transaction(trans, + root, err); + goto error; + } } else if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { - inode_sub_bytes(inode, item_end + 1 - - found_key.offset); + inode_sub_bytes(inode, item_end + 1 - new_size); } } delete: @@ -4461,7 +4537,7 @@ delete: ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, btrfs_header_owner(leaf), - ino, extent_offset, 0); + ino, extent_offset); BUG_ON(ret); if (btrfs_should_throttle_delayed_refs(trans, root)) btrfs_async_run_delayed_refs(root, @@ -4575,14 +4651,17 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, if ((offset & (blocksize - 1)) == 0 && (!len || ((len & (blocksize - 1)) == 0))) goto out; - ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + ret = btrfs_delalloc_reserve_space(inode, + round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE); if (ret) goto out; again: page = find_or_create_page(mapping, index, mask); if (!page) { - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, + round_down(from, PAGE_CACHE_SIZE), + PAGE_CACHE_SIZE); ret = -ENOMEM; goto out; } @@ -4650,7 +4729,8 @@ again: out_unlock: if (ret) - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, page_start, + PAGE_CACHE_SIZE); unlock_page(page); page_cache_release(page); out: @@ -5048,6 +5128,18 @@ static void evict_inode_truncate_pages(struct inode *inode) spin_unlock(&io_tree->lock); lock_extent_bits(io_tree, start, end, 0, &cached_state); + + /* + * If still has DELALLOC flag, the extent didn't reach disk, + * and its reserved space won't be freed by delayed_ref. + * So we need to free its reserved space here. + * (Refer to comment in btrfs_invalidatepage, case 2) + * + * Note, end is the bytenr of last byte, so we need + 1 here. + */ + if (state->state & EXTENT_DELALLOC) + btrfs_qgroup_free_data(inode, start, end - start + 1); + clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | @@ -7581,7 +7673,7 @@ unlock: spin_unlock(&BTRFS_I(inode)->lock); } - btrfs_free_reserved_data_space(inode, len); + btrfs_free_reserved_data_space(inode, start, len); WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; current->journal_info = dio_data; @@ -8371,7 +8463,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, mutex_unlock(&inode->i_mutex); relock = true; } - ret = btrfs_delalloc_reserve_space(inode, count); + ret = btrfs_delalloc_reserve_space(inode, offset, count); if (ret) goto out; dio_data.outstanding_extents = div64_u64(count + @@ -8400,10 +8492,10 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) - btrfs_delalloc_release_space(inode, - dio_data.reserve); + btrfs_delalloc_release_space(inode, offset, + dio_data.reserve); } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, offset, count - (size_t)ret); } out: @@ -8562,6 +8654,18 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, } } + /* + * Qgroup reserved space handler + * Page here will be either + * 1) Already written to disk + * In this case, its reserved space is released from data rsv map + * and will be freed by delayed_ref handler finally. + * So even we call qgroup_free_data(), it won't decrease reserved + * space. + * 2) Not written to disk + * This means the reserved space should be freed here. + */ + btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | @@ -8612,7 +8716,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_end; sb_start_pagefault(inode->i_sb); - ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + ret = btrfs_delalloc_reserve_space(inode, page_start, + PAGE_CACHE_SIZE); if (!ret) { ret = file_update_time(vma->vm_file); reserved = 1; @@ -8631,8 +8739,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) again: lock_page(page); size = i_size_read(inode); - page_start = page_offset(page); - page_end = page_start + PAGE_CACHE_SIZE - 1; if ((page->mapping != inode->i_mapping) || (page_start >= size)) { @@ -8709,7 +8815,7 @@ out_unlock: } unlock_page(page); out: - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE); out_noreserve: sb_end_pagefault(inode->i_sb); return ret; @@ -8998,6 +9104,7 @@ void btrfs_destroy_inode(struct inode *inode) btrfs_put_ordered_extent(ordered); } } + btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); free: @@ -9634,6 +9741,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 cur_offset = start; u64 i_size; u64 cur_bytes; + u64 last_alloc = (u64)-1; int ret = 0; bool own_trans = true; @@ -9650,6 +9758,13 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); cur_bytes = max(cur_bytes, min_size); + /* + * If we are severely fragmented we could end up with really + * small allocations, so if the allocator is returning small + * chunks lets make its job easier by only searching for those + * sized chunks. + */ + cur_bytes = min(cur_bytes, last_alloc); ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, *alloc_hint, &ins, 1, 0); if (ret) { @@ -9658,6 +9773,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, break; } + last_alloc = ins.offset; ret = insert_reserved_file_extent(trans, inode, cur_offset, ins.objectid, ins.offset, ins.offset, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8d20f3b..da94138 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1120,7 +1120,8 @@ static int cluster_pages_for_defrag(struct inode *inode, page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); ret = btrfs_delalloc_reserve_space(inode, - page_cnt << PAGE_CACHE_SHIFT); + start_index << PAGE_CACHE_SHIFT, + page_cnt << PAGE_CACHE_SHIFT); if (ret) return ret; i_done = 0; @@ -1210,7 +1211,8 @@ again: BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, - (page_cnt - i_done) << PAGE_CACHE_SHIFT); + start_index << PAGE_CACHE_SHIFT, + (page_cnt - i_done) << PAGE_CACHE_SHIFT); } @@ -1235,7 +1237,9 @@ out: unlock_page(pages[i]); page_cache_release(pages[i]); } - btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT); + btrfs_delalloc_release_space(inode, + start_index << PAGE_CACHE_SHIFT, + page_cnt << PAGE_CACHE_SHIFT); return ret; } @@ -1342,7 +1346,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, break; if (btrfs_defrag_cancelled(root->fs_info)) { - printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n"); + btrfs_debug(root->fs_info, "defrag_file cancelled"); ret = -EAGAIN; break; } @@ -1579,7 +1583,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = div_u64(new_size, root->sectorsize); new_size *= root->sectorsize; - printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n", + btrfs_info_in_rcu(root->fs_info, "new size for %s is %llu", rcu_str_deref(device->name), new_size); if (new_size > old_size) { @@ -2081,7 +2085,7 @@ static noinline int search_ioctl(struct inode *inode, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - printk(KERN_ERR "BTRFS: could not find root %llu\n", + btrfs_err(info, "could not find root %llu", sk->tree_id); btrfs_free_path(path); return -ENOENT; @@ -2221,7 +2225,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id); + btrfs_err(info, "could not find root %llu", tree_id); ret = -ENOENT; goto out; } @@ -2699,7 +2703,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) { struct btrfs_ioctl_fs_info_args *fi_args; struct btrfs_device *device; - struct btrfs_device *next; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; int ret = 0; @@ -2711,7 +2714,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) fi_args->num_devices = fs_devices->num_devices; memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid)); - list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { + list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->devid > fi_args->max_id) fi_args->max_id = device->devid; } @@ -3203,41 +3206,6 @@ out: return ret; } -/* Helper to check and see if this root currently has a ref on the given disk - * bytenr. If it does then we need to update the quota for this root. This - * doesn't do anything if quotas aren't enabled. - */ -static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 disko) -{ - struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); - struct ulist *roots; - struct ulist_iterator uiter; - struct ulist_node *root_node = NULL; - int ret; - - if (!root->fs_info->quota_enabled) - return 1; - - btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); - ret = btrfs_find_all_roots(trans, root->fs_info, disko, - tree_mod_seq_elem.seq, &roots); - if (ret < 0) - goto out; - ret = 0; - ULIST_ITER_INIT(&uiter); - while ((root_node = ulist_next(roots, &uiter))) { - if (root_node->val == root->objectid) { - ret = 1; - break; - } - } - ulist_free(roots); -out: - btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); - return ret; -} - static int clone_finish_inode_update(struct btrfs_trans_handle *trans, struct inode *inode, u64 endoff, @@ -3328,6 +3296,150 @@ static void clone_update_extent_map(struct inode *inode, &BTRFS_I(inode)->runtime_flags); } +/* + * Make sure we do not end up inserting an inline extent into a file that has + * already other (non-inline) extents. If a file has an inline extent it can + * not have any other extents and the (single) inline extent must start at the + * file offset 0. Failing to respect these rules will lead to file corruption, + * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc + * + * We can have extents that have been already written to disk or we can have + * dirty ranges still in delalloc, in which case the extent maps and items are + * created only when we run delalloc, and the delalloc ranges might fall outside + * the range we are currently locking in the inode's io tree. So we check the + * inode's i_size because of that (i_size updates are done while holding the + * i_mutex, which we are holding here). + * We also check to see if the inode has a size not greater than "datal" but has + * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are + * protected against such concurrent fallocate calls by the i_mutex). + * + * If the file has no extents but a size greater than datal, do not allow the + * copy because we would need turn the inline extent into a non-inline one (even + * with NO_HOLES enabled). If we find our destination inode only has one inline + * extent, just overwrite it with the source inline extent if its size is less + * than the source extent's size, or we could copy the source inline extent's + * data into the destination inode's inline extent if the later is greater then + * the former. + */ +static int clone_copy_inline_extent(struct inode *src, + struct inode *dst, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_key *new_key, + const u64 drop_start, + const u64 datal, + const u64 skip, + const u64 size, + char *inline_data) +{ + struct btrfs_root *root = BTRFS_I(dst)->root; + const u64 aligned_end = ALIGN(new_key->offset + datal, + root->sectorsize); + int ret; + struct btrfs_key key; + + if (new_key->offset > 0) + return -EOPNOTSUPP; + + key.objectid = btrfs_ino(dst); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + return ret; + } else if (ret > 0) { + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + goto copy_inline_extent; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == btrfs_ino(dst) && + key.type == BTRFS_EXTENT_DATA_KEY) { + ASSERT(key.offset > 0); + return -EOPNOTSUPP; + } + } else if (i_size_read(dst) <= datal) { + struct btrfs_file_extent_item *ei; + u64 ext_len; + + /* + * If the file size is <= datal, make sure there are no other + * extents following (can happen do to an fallocate call with + * the flag FALLOC_FL_KEEP_SIZE). + */ + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + /* + * If it's an inline extent, it can not have other extents + * following it. + */ + if (btrfs_file_extent_type(path->nodes[0], ei) == + BTRFS_FILE_EXTENT_INLINE) + goto copy_inline_extent; + + ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei); + if (ext_len > aligned_end) + return -EOPNOTSUPP; + + ret = btrfs_next_item(root, path); + if (ret < 0) { + return ret; + } else if (ret == 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == btrfs_ino(dst) && + key.type == BTRFS_EXTENT_DATA_KEY) + return -EOPNOTSUPP; + } + } + +copy_inline_extent: + /* + * We have no extent items, or we have an extent at offset 0 which may + * or may not be inlined. All these cases are dealt the same way. + */ + if (i_size_read(dst) > datal) { + /* + * If the destination inode has an inline extent... + * This would require copying the data from the source inline + * extent into the beginning of the destination's inline extent. + * But this is really complex, both extents can be compressed + * or just one of them, which would require decompressing and + * re-compressing data (which could increase the new compressed + * size, not allowing the compressed data to fit anymore in an + * inline extent). + * So just don't support this case for now (it should be rare, + * we are not really saving space when cloning inline extents). + */ + return -EOPNOTSUPP; + } + + btrfs_release_path(path); + ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1); + if (ret) + return ret; + ret = btrfs_insert_empty_item(trans, root, path, new_key, size); + if (ret) + return ret; + + if (skip) { + const u32 start = btrfs_file_extent_calc_inline_size(0); + + memmove(inline_data + start, inline_data + start + skip, datal); + } + + write_extent_buffer(path->nodes[0], inline_data, + btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]), + size); + inode_add_bytes(dst, datal); + + return 0; +} + /** * btrfs_clone() - clone a range from inode file to another * @@ -3352,9 +3464,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u32 nritems; int slot; int ret; - int no_quota; const u64 len = olen_aligned; - u64 last_disko = 0; u64 last_dest_end = destoff; ret = -ENOMEM; @@ -3400,7 +3510,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, nritems = btrfs_header_nritems(path->nodes[0]); process_slot: - no_quota = 1; if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(BTRFS_I(src)->root, path); if (ret < 0) @@ -3552,35 +3661,13 @@ process_slot: btrfs_set_file_extent_num_bytes(leaf, extent, datal); - /* - * We need to look up the roots that point at - * this bytenr and see if the new root does. If - * it does not we need to make sure we update - * quotas appropriately. - */ - if (disko && root != BTRFS_I(src)->root && - disko != last_disko) { - no_quota = check_ref(trans, root, - disko); - if (no_quota < 0) { - btrfs_abort_transaction(trans, - root, - ret); - btrfs_end_transaction(trans, - root); - ret = no_quota; - goto out; - } - } - if (disko) { inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, root, disko, diskl, 0, root->root_key.objectid, btrfs_ino(inode), - new_key.offset - datao, - no_quota); + new_key.offset - datao); if (ret) { btrfs_abort_transaction(trans, root, @@ -3594,21 +3681,6 @@ process_slot: } else if (type == BTRFS_FILE_EXTENT_INLINE) { u64 skip = 0; u64 trim = 0; - u64 aligned_end = 0; - - /* - * Don't copy an inline extent into an offset - * greater than zero. Having an inline extent - * at such an offset results in chaos as btrfs - * isn't prepared for such cases. Just skip - * this case for the same reasons as commented - * at btrfs_ioctl_clone(). - */ - if (last_dest_end > 0) { - ret = -EOPNOTSUPP; - btrfs_end_transaction(trans, root); - goto out; - } if (off > key.offset) { skip = off - key.offset; @@ -3626,42 +3698,22 @@ process_slot: size -= skip + trim; datal -= skip + trim; - aligned_end = ALIGN(new_key.offset + datal, - root->sectorsize); - ret = btrfs_drop_extents(trans, root, inode, - drop_start, - aligned_end, - 1); + ret = clone_copy_inline_extent(src, inode, + trans, path, + &new_key, + drop_start, + datal, + skip, size, buf); if (ret) { if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, - root, ret); - btrfs_end_transaction(trans, root); - goto out; - } - - ret = btrfs_insert_empty_item(trans, root, path, - &new_key, size); - if (ret) { - btrfs_abort_transaction(trans, root, - ret); + root, + ret); btrfs_end_transaction(trans, root); goto out; } - - if (skip) { - u32 start = - btrfs_file_extent_calc_inline_size(0); - memmove(buf+start, buf+start+skip, - datal); - } - leaf = path->nodes[0]; slot = path->slots[0]; - write_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); - inode_add_bytes(inode, datal); } /* If we have an implicit hole (NO_HOLES feature). */ @@ -4814,7 +4866,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) /* update qgroup status and info */ err = btrfs_run_qgroups(trans, root->fs_info); if (err < 0) - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "failed to update qgroup status and info\n"); err = btrfs_end_transaction(trans, root); if (err && !ret) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index d7e6baf..8077461 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -79,6 +79,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) write_lock(&eb->lock); WARN_ON(atomic_read(&eb->spinning_writers)); atomic_inc(&eb->spinning_writers); + /* + * atomic_dec_and_test implies a barrier for waitqueue_active + */ if (atomic_dec_and_test(&eb->blocking_writers) && waitqueue_active(&eb->write_lock_wq)) wake_up(&eb->write_lock_wq); @@ -86,6 +89,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) BUG_ON(atomic_read(&eb->blocking_readers) == 0); read_lock(&eb->lock); atomic_inc(&eb->spinning_readers); + /* + * atomic_dec_and_test implies a barrier for waitqueue_active + */ if (atomic_dec_and_test(&eb->blocking_readers) && waitqueue_active(&eb->read_lock_wq)) wake_up(&eb->read_lock_wq); @@ -229,6 +235,9 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->blocking_readers) == 0); + /* + * atomic_dec_and_test implies a barrier for waitqueue_active + */ if (atomic_dec_and_test(&eb->blocking_readers) && waitqueue_active(&eb->read_lock_wq)) wake_up(&eb->read_lock_wq); @@ -280,6 +289,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb) if (blockers) { WARN_ON(atomic_read(&eb->spinning_writers)); atomic_dec(&eb->blocking_writers); + /* + * Make sure counter is updated before we wake up waiters. + */ smp_mb(); if (waitqueue_active(&eb->write_lock_wq)) wake_up(&eb->write_lock_wq); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 52170cf..8c27292 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -345,6 +345,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + /* + * Implicit memory barrier after test_and_set_bit + */ if (waitqueue_active(&entry->wait)) wake_up(&entry->wait); } else { @@ -409,6 +412,9 @@ have_entry: if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + /* + * Implicit memory barrier after test_and_set_bit + */ if (waitqueue_active(&entry->wait)) wake_up(&entry->wait); } else { @@ -484,15 +490,16 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, spin_lock_irq(&log->log_extents_lock[index]); while (!list_empty(&log->logged_list[index])) { + struct inode *inode; ordered = list_first_entry(&log->logged_list[index], struct btrfs_ordered_extent, log_list); list_del_init(&ordered->log_list); + inode = ordered->inode; spin_unlock_irq(&log->log_extents_lock[index]); if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { - struct inode *inode = ordered->inode; u64 start = ordered->file_offset; u64 end = ordered->file_offset + ordered->len - 1; @@ -503,20 +510,25 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, &ordered->flags)); /* - * If our ordered extent completed it means it updated the - * fs/subvol and csum trees already, so no need to make the - * current transaction's commit wait for it, as we end up - * holding memory unnecessarily and delaying the inode's iput - * until the transaction commit (we schedule an iput for the - * inode when the ordered extent's refcount drops to 0), which - * prevents it from being evictable until the transaction - * commits. + * In order to keep us from losing our ordered extent + * information when committing the transaction we have to make + * sure that any logged extents are completed when we go to + * commit the transaction. To do this we simply increase the + * current transactions pending_ordered counter and decrement it + * when the ordered extent completes. */ - if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) - btrfs_put_ordered_extent(ordered); - else - list_add_tail(&ordered->trans_list, &trans->ordered); - + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + struct btrfs_ordered_inode_tree *tree; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); + atomic_inc(&trans->transaction->pending_ordered); + } + spin_unlock_irq(&tree->lock); + } + btrfs_put_ordered_extent(ordered); spin_lock_irq(&log->log_extents_lock[index]); } spin_unlock_irq(&log->log_extents_lock[index]); @@ -578,6 +590,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_inode_tree *tree; struct btrfs_root *root = BTRFS_I(inode)->root; struct rb_node *node; + bool dec_pending_ordered = false; tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); @@ -587,8 +600,37 @@ void btrfs_remove_ordered_extent(struct inode *inode, if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags)) + dec_pending_ordered = true; spin_unlock_irq(&tree->lock); + /* + * The current running transaction is waiting on us, we need to let it + * know that we're complete and wake it up. + */ + if (dec_pending_ordered) { + struct btrfs_transaction *trans; + + /* + * The checks for trans are just a formality, it should be set, + * but if it isn't we don't want to deref/assert under the spin + * lock, so be nice and check if trans is set, but ASSERT() so + * if it isn't set a developer will notice. + */ + spin_lock(&root->fs_info->trans_lock); + trans = root->fs_info->running_transaction; + if (trans) + atomic_inc(&trans->use_count); + spin_unlock(&root->fs_info->trans_lock); + + ASSERT(trans); + if (trans) { + if (atomic_dec_and_test(&trans->pending_ordered)) + wake_up(&trans->pending_wait); + btrfs_put_transaction(trans); + } + } + spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 7176cc0..23c9605 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -73,6 +73,8 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent * in the logging code. */ +#define BTRFS_ORDERED_PENDING 11 /* We are waiting for this ordered extent to + * complete in the current transaction. */ struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index dca137b..f9e6023 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -49,18 +49,16 @@ static struct prop_handler prop_handlers[] = { .extract = prop_compression_extract, .inheritable = 1 }, - { - .xattr_name = NULL - } }; void __init btrfs_props_init(void) { - struct prop_handler *p; + int i; hash_init(prop_handlers_ht); - for (p = &prop_handlers[0]; p->xattr_name; p++) { + for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { + struct prop_handler *p = &prop_handlers[i]; u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name)); hash_add(prop_handlers_ht, &p->node, h); @@ -301,15 +299,16 @@ static int inherit_props(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *parent) { - const struct prop_handler *h; struct btrfs_root *root = BTRFS_I(inode)->root; int ret; + int i; if (!test_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(parent)->runtime_flags)) return 0; - for (h = &prop_handlers[0]; h->xattr_name; h++) { + for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { + const struct prop_handler *h = &prop_handlers[i]; const char *value; u64 num_bytes; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index d904ee1..46476c2 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1652,10 +1652,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, } } - /* For exclusive extent, free its reserved bytes too */ - if (nr_old_roots == 0 && nr_new_roots == 1 && - cur_new_count == nr_new_roots) - qg->reserved -= num_bytes; if (dirty) qgroup_dirty(fs_info, qg); } @@ -2035,7 +2031,7 @@ out: return ret; } -int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) +static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; @@ -2116,14 +2112,13 @@ out: return ret; } -void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) +void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, + u64 ref_root, u64 num_bytes) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; - struct btrfs_fs_info *fs_info = root->fs_info; struct ulist_node *unode; struct ulist_iterator uiter; - u64 ref_root = root->root_key.objectid; int ret = 0; if (!is_fstree(ref_root)) @@ -2169,6 +2164,11 @@ out: spin_unlock(&fs_info->qgroup_lock); } +static inline void qgroup_free(struct btrfs_root *root, u64 num_bytes) +{ + return btrfs_qgroup_free_refroot(root->fs_info, root->objectid, + num_bytes); +} void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) { if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) @@ -2188,10 +2188,10 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) */ static int qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_trans_handle *trans, - struct extent_buffer *scratch_leaf) + struct btrfs_trans_handle *trans) { struct btrfs_key found; + struct extent_buffer *scratch_leaf = NULL; struct ulist *roots = NULL; struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); u64 num_bytes; @@ -2229,7 +2229,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); - memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf)); + scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]); + if (!scratch_leaf) { + ret = -ENOMEM; + mutex_unlock(&fs_info->qgroup_rescan_lock); + goto out; + } + extent_buffer_get(scratch_leaf); + btrfs_tree_read_lock(scratch_leaf); + btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK); slot = path->slots[0]; btrfs_release_path(path); mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -2255,6 +2263,10 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, goto out; } out: + if (scratch_leaf) { + btrfs_tree_read_unlock_blocking(scratch_leaf); + free_extent_buffer(scratch_leaf); + } btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); return ret; @@ -2266,16 +2278,12 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - struct extent_buffer *scratch_leaf = NULL; int err = -ENOMEM; int ret = 0; path = btrfs_alloc_path(); if (!path) goto out; - scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS); - if (!scratch_leaf) - goto out; err = 0; while (!err) { @@ -2287,8 +2295,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) if (!fs_info->quota_enabled) { err = -EINTR; } else { - err = qgroup_rescan_leaf(fs_info, path, trans, - scratch_leaf); + err = qgroup_rescan_leaf(fs_info, path, trans); } if (err > 0) btrfs_commit_transaction(trans, fs_info->fs_root); @@ -2297,7 +2304,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) } out: - kfree(scratch_leaf); btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); @@ -2486,3 +2492,190 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); } + +/* + * Reserve qgroup space for range [start, start + len). + * + * This function will either reserve space from related qgroups or doing + * nothing if the range is already reserved. + * + * Return 0 for successful reserve + * Return <0 for error (including -EQUOT) + * + * NOTE: this function may sleep for memory allocation. + */ +int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_changeset changeset; + struct ulist_node *unode; + struct ulist_iterator uiter; + int ret; + + if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) || + len == 0) + return 0; + + changeset.bytes_changed = 0; + changeset.range_changed = ulist_alloc(GFP_NOFS); + ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, + start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, + &changeset); + trace_btrfs_qgroup_reserve_data(inode, start, len, + changeset.bytes_changed, + QGROUP_RESERVE); + if (ret < 0) + goto cleanup; + ret = qgroup_reserve(root, changeset.bytes_changed); + if (ret < 0) + goto cleanup; + + ulist_free(changeset.range_changed); + return ret; + +cleanup: + /* cleanup already reserved ranges */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(changeset.range_changed, &uiter))) + clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, + unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL, + GFP_NOFS); + ulist_free(changeset.range_changed); + return ret; +} + +static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, + int free) +{ + struct extent_changeset changeset; + int trace_op = QGROUP_RELEASE; + int ret; + + changeset.bytes_changed = 0; + changeset.range_changed = ulist_alloc(GFP_NOFS); + if (!changeset.range_changed) + return -ENOMEM; + + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, + start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, + &changeset); + if (ret < 0) + goto out; + + if (free) { + qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed); + trace_op = QGROUP_FREE; + } + trace_btrfs_qgroup_release_data(inode, start, len, + changeset.bytes_changed, trace_op); +out: + ulist_free(changeset.range_changed); + return ret; +} + +/* + * Free a reserved space range from io_tree and related qgroups + * + * Should be called when a range of pages get invalidated before reaching disk. + * Or for error cleanup case. + * + * For data written to disk, use btrfs_qgroup_release_data(). + * + * NOTE: This function may sleep for memory allocation. + */ +int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) +{ + return __btrfs_qgroup_release_data(inode, start, len, 1); +} + +/* + * Release a reserved space range from io_tree only. + * + * Should be called when a range of pages get written to disk and corresponding + * FILE_EXTENT is inserted into corresponding root. + * + * Since new qgroup accounting framework will only update qgroup numbers at + * commit_transaction() time, its reserved space shouldn't be freed from + * related qgroups. + * + * But we should release the range from io_tree, to allow further write to be + * COWed. + * + * NOTE: This function may sleep for memory allocation. + */ +int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) +{ + return __btrfs_qgroup_release_data(inode, start, len, 0); +} + +int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes) +{ + int ret; + + if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) || + num_bytes == 0) + return 0; + + BUG_ON(num_bytes != round_down(num_bytes, root->nodesize)); + ret = qgroup_reserve(root, num_bytes); + if (ret < 0) + return ret; + atomic_add(num_bytes, &root->qgroup_meta_rsv); + return ret; +} + +void btrfs_qgroup_free_meta_all(struct btrfs_root *root) +{ + int reserved; + + if (!root->fs_info->quota_enabled || !is_fstree(root->objectid)) + return; + + reserved = atomic_xchg(&root->qgroup_meta_rsv, 0); + if (reserved == 0) + return; + qgroup_free(root, reserved); +} + +void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) +{ + if (!root->fs_info->quota_enabled || !is_fstree(root->objectid)) + return; + + BUG_ON(num_bytes != round_down(num_bytes, root->nodesize)); + WARN_ON(atomic_read(&root->qgroup_meta_rsv) < num_bytes); + atomic_sub(num_bytes, &root->qgroup_meta_rsv); + qgroup_free(root, num_bytes); +} + +/* + * Check qgroup reserved space leaking, normally at destory inode + * time + */ +void btrfs_qgroup_check_reserved_leak(struct inode *inode) +{ + struct extent_changeset changeset; + struct ulist_node *unode; + struct ulist_iterator iter; + int ret; + + changeset.bytes_changed = 0; + changeset.range_changed = ulist_alloc(GFP_NOFS); + if (WARN_ON(!changeset.range_changed)) + return; + + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset); + + WARN_ON(ret < 0); + if (WARN_ON(changeset.bytes_changed)) { + ULIST_ITER_INIT(&iter); + while ((unode = ulist_next(changeset.range_changed, &iter))) { + btrfs_warn(BTRFS_I(inode)->root->fs_info, + "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu", + inode->i_ino, unode->val, unode->aux); + } + qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed); + } + ulist_free(changeset.range_changed); +} diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 6387dcf..ecb2c14 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -33,6 +33,13 @@ struct btrfs_qgroup_extent_record { struct ulist *old_roots; }; +/* + * For qgroup event trace points only + */ +#define QGROUP_RESERVE (1<<0) +#define QGROUP_RELEASE (1<<1) +#define QGROUP_FREE (1<<2) + int btrfs_quota_enable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_quota_disable(struct btrfs_trans_handle *trans, @@ -71,9 +78,18 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, struct btrfs_qgroup_inherit *inherit); -int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); -void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes); - +void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, + u64 ref_root, u64 num_bytes); +/* + * TODO: Add proper trace point for it, as btrfs_qgroup_free() is + * called by everywhere, can't provide good trace for delayed ref case. + */ +static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, + u64 ref_root, u64 num_bytes) +{ + btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); + trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes); +} void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -81,4 +97,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, u64 rfer, u64 excl); #endif +/* New io_tree based accurate qgroup reserve API */ +int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len); + +int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes); +void btrfs_qgroup_free_meta_all(struct btrfs_root *root); +void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes); +void btrfs_qgroup_check_reserved_leak(struct inode *inode); #endif /* __BTRFS_QGROUP__ */ diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index fcf7265..1a33d3e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -810,7 +810,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) } goto done_nolock; - } else if (waitqueue_active(&h->wait)) { + /* + * The barrier for this waitqueue_active is not needed, + * we're protected by h->lock and can't miss a wakeup. + */ + } else if (waitqueue_active(&h->wait)) { spin_unlock(&rbio->bio_list_lock); spin_unlock_irqrestore(&h->lock, flags); wake_up(&h->wait); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 4645cd1..619f929 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -569,7 +569,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical, rec = kzalloc(sizeof(*rec), GFP_NOFS); if (!rec) { reada_extent_put(root->fs_info, re); - return -1; + return -ENOMEM; } rec->rc = rc; @@ -918,6 +918,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, u64 start; u64 generation; int level; + int ret; struct extent_buffer *node; static struct btrfs_key max_key = { .objectid = (u64)-1, @@ -943,9 +944,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, generation = btrfs_header_generation(node); free_extent_buffer(node); - if (reada_add_block(rc, start, &max_key, level, generation)) { + ret = reada_add_block(rc, start, &max_key, level, generation); + if (ret) { kfree(rc); - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); } reada_start_machine(root->fs_info); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 303babe..b4ca545 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1716,7 +1716,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, new_bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset, 1); + key.objectid, key.offset); if (ret) { btrfs_abort_transaction(trans, root, ret); break; @@ -1724,7 +1724,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, ret = btrfs_free_extent(trans, root, bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset, 1); + key.objectid, key.offset); if (ret) { btrfs_abort_transaction(trans, root, ret); break; @@ -1900,23 +1900,21 @@ again: ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0, - 1); + src->root_key.objectid, level - 1, 0); BUG_ON(ret); ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0, 1); + 0); BUG_ON(ret); ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0, - 1); + src->root_key.objectid, level - 1, 0); BUG_ON(ret); ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0, 1); + 0); BUG_ON(ret); btrfs_unlock_up_safe(path, 0); @@ -2418,7 +2416,7 @@ again: } out: if (ret) { - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); if (!list_empty(&reloc_roots)) free_reloc_roots(&reloc_roots); @@ -2745,7 +2743,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, node->eb->start, blocksize, upper->eb->start, btrfs_header_owner(upper->eb), - node->level, 0, 1); + node->level, 0); BUG_ON(ret); ret = btrfs_drop_subtree(trans, root, eb, upper->eb); @@ -3034,8 +3032,8 @@ int prealloc_file_extent_cluster(struct inode *inode, BUG_ON(cluster->start != cluster->boundary[0]); mutex_lock(&inode->i_mutex); - ret = btrfs_check_data_free_space(inode, cluster->end + - 1 - cluster->start, 0); + ret = btrfs_check_data_free_space(inode, cluster->start, + cluster->end + 1 - cluster->start); if (ret) goto out; @@ -3056,8 +3054,8 @@ int prealloc_file_extent_cluster(struct inode *inode, break; nr++; } - btrfs_free_reserved_data_space(inode, cluster->end + - 1 - cluster->start); + btrfs_free_reserved_data_space(inode, cluster->start, + cluster->end + 1 - cluster->start); out: mutex_unlock(&inode->i_mutex); return ret; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 360a728..7cf8509 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -45,12 +45,13 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, if (!need_reset && btrfs_root_generation(item) != btrfs_root_generation_v2(item)) { if (btrfs_root_generation_v2(item) != 0) { - printk(KERN_WARNING "BTRFS: mismatching " + btrfs_warn(eb->fs_info, + "mismatching " "generation and generation_v2 " "found in root item. This root " "was probably mounted with an " "older kernel. Resetting all " - "new fields.\n"); + "new fields."); } need_reset = 1; } @@ -141,7 +142,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root int ret; int slot; unsigned long ptr; - int old_len; + u32 old_len; path = btrfs_alloc_path(); if (!path) @@ -283,7 +284,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) trans = btrfs_join_transaction(tree_root); if (IS_ERR(trans)) { err = PTR_ERR(trans); - btrfs_error(tree_root->fs_info, err, + btrfs_std_error(tree_root->fs_info, err, "Failed to start trans to delete " "orphan item"); break; @@ -292,7 +293,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) root_key.objectid); btrfs_end_transaction(trans, tree_root); if (err) { - btrfs_error(tree_root->fs_info, err, + btrfs_std_error(tree_root->fs_info, err, "Failed to delete root orphan " "item"); break; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a39f5d1..550de89 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -580,9 +580,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, * hold all of the paths here */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) - printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " + btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu, " - "length %llu, links %u (path: %s)\n", swarn->errstr, + "length %llu, links %u (path: %s)", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), (unsigned long long)swarn->sector, root, inum, offset, min(isize - offset, (u64)PAGE_SIZE), nlink, @@ -592,9 +592,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, return 0; err: - printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " + btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu: path " - "resolving failed with ret=%d\n", swarn->errstr, + "resolving failed with ret=%d", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), (unsigned long long)swarn->sector, root, inum, offset, ret); @@ -649,10 +649,10 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, item_size, &ref_root, &ref_level); - printk_in_rcu(KERN_WARNING - "BTRFS: %s at logical %llu on dev %s, " + btrfs_warn_in_rcu(fs_info, + "%s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " - "%llu\n", errstr, swarn.logical, + "%llu", errstr, swarn.logical, rcu_str_deref(dev->name), (unsigned long long)swarn.sector, ref_level ? "node" : "leaf", @@ -850,8 +850,8 @@ out: btrfs_dev_replace_stats_inc( &sctx->dev_root->fs_info->dev_replace. num_uncorrectable_read_errors); - printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " - "unable to fixup (nodatasum) error at logical %llu on dev %s\n", + btrfs_err_rl_in_rcu(sctx->dev_root->fs_info, + "unable to fixup (nodatasum) error at logical %llu on dev %s", fixup->logical, rcu_str_deref(fixup->dev->name)); } @@ -1230,8 +1230,8 @@ corrected_error: sctx->stat.corrected_errors++; sblock_to_check->data_corrected = 1; spin_unlock(&sctx->stat_lock); - printk_ratelimited_in_rcu(KERN_ERR - "BTRFS: fixed up error at logical %llu on dev %s\n", + btrfs_err_rl_in_rcu(fs_info, + "fixed up error at logical %llu on dev %s", logical, rcu_str_deref(dev->name)); } } else { @@ -1239,8 +1239,8 @@ did_not_correct_error: spin_lock(&sctx->stat_lock); sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); - printk_ratelimited_in_rcu(KERN_ERR - "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n", + btrfs_err_rl_in_rcu(fs_info, + "unable to fixup (regular) error at logical %llu on dev %s", logical, rcu_str_deref(dev->name)); } @@ -1626,9 +1626,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, int ret; if (!page_bad->dev->bdev) { - printk_ratelimited(KERN_WARNING "BTRFS: " + btrfs_warn_rl(sblock_bad->sctx->dev_root->fs_info, "scrub_repair_page_from_good_copy(bdev == NULL) " - "is unexpected!\n"); + "is unexpected"); return -EIO; } @@ -2201,15 +2201,15 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work) spin_lock(&sctx->stat_lock); sctx->stat.read_errors++; spin_unlock(&sctx->stat_lock); - printk_ratelimited_in_rcu(KERN_ERR - "BTRFS: I/O error rebulding logical %llu for dev %s\n", + btrfs_err_rl_in_rcu(fs_info, + "IO error rebuilding logical %llu for dev %s", logical, rcu_str_deref(dev->name)); } else if (sblock->header_error || sblock->checksum_error) { spin_lock(&sctx->stat_lock); sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); - printk_ratelimited_in_rcu(KERN_ERR - "BTRFS: failed to rebuild valid logical %llu for dev %s\n", + btrfs_err_rl_in_rcu(fs_info, + "failed to rebuild valid logical %llu for dev %s", logical, rcu_str_deref(dev->name)); } else { scrub_write_block_to_dev_replace(sblock); @@ -4375,8 +4375,8 @@ static int write_page_nocow(struct scrub_ctx *sctx, if (!dev) return -EIO; if (!dev->bdev) { - printk_ratelimited(KERN_WARNING - "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); + btrfs_warn_rl(dev->dev_root->fs_info, + "scrub write_page_nocow(bdev == NULL) is unexpected"); return -EIO; } bio = btrfs_io_bio_alloc(GFP_NOFS, 1); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a739b82..355a458 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1434,16 +1434,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, " } if (cur_clone_root) { - if (compressed != BTRFS_COMPRESS_NONE) { - /* - * Offsets given by iterate_extent_inodes() are relative - * to the start of the extent, we need to add logical - * offset from the file extent item. - * (See why at backref.c:check_extent_in_eb()) - */ - cur_clone_root->offset += btrfs_file_extent_offset(eb, - fi); - } *found = cur_clone_root; ret = 0; } else { @@ -2353,8 +2343,14 @@ static int send_subvol_begin(struct send_ctx *sctx) } TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen); - TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, - sctx->send_root->root_item.uuid); + + if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, + sctx->send_root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, + sctx->send_root->root_item.uuid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, le64_to_cpu(sctx->send_root->root_item.ctransid)); if (parent_root) { @@ -2564,7 +2560,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino); } else if (S_ISSOCK(mode)) { cmd = BTRFS_SEND_C_MKSOCK; } else { - printk(KERN_WARNING "btrfs: unexpected inode type %o", + btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o", (int)(mode & S_IFMT)); ret = -ENOTSUPP; goto out; @@ -4687,6 +4683,171 @@ tlv_put_failure: return ret; } +static int send_extent_data(struct send_ctx *sctx, + const u64 offset, + const u64 len) +{ + u64 sent = 0; + + if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) + return send_update_extent(sctx, offset, len); + + while (sent < len) { + u64 size = len - sent; + int ret; + + if (size > BTRFS_SEND_READ_SIZE) + size = BTRFS_SEND_READ_SIZE; + ret = send_write(sctx, offset + sent, size); + if (ret < 0) + return ret; + if (!ret) + break; + sent += ret; + } + return 0; +} + +static int clone_range(struct send_ctx *sctx, + struct clone_root *clone_root, + const u64 disk_byte, + u64 data_offset, + u64 offset, + u64 len) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + /* + * We can't send a clone operation for the entire range if we find + * extent items in the respective range in the source file that + * refer to different extents or if we find holes. + * So check for that and do a mix of clone and regular write/copy + * operations if needed. + * + * Example: + * + * mkfs.btrfs -f /dev/sda + * mount /dev/sda /mnt + * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo + * cp --reflink=always /mnt/foo /mnt/bar + * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo + * btrfs subvolume snapshot -r /mnt /mnt/snap + * + * If when we send the snapshot and we are processing file bar (which + * has a higher inode number than foo) we blindly send a clone operation + * for the [0, 100K[ range from foo to bar, the receiver ends up getting + * a file bar that matches the content of file foo - iow, doesn't match + * the content from bar in the original filesystem. + */ + key.objectid = clone_root->ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = clone_root->offset; + ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == clone_root->ino && + key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_file_extent_item *ei; + u8 type; + u64 ext_len; + u64 clone_len; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(clone_root->root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + + /* + * We might have an implicit trailing hole (NO_HOLES feature + * enabled). We deal with it after leaving this loop. + */ + if (key.objectid != clone_root->ino || + key.type != BTRFS_EXTENT_DATA_KEY) + break; + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + type = btrfs_file_extent_type(leaf, ei); + if (type == BTRFS_FILE_EXTENT_INLINE) { + ext_len = btrfs_file_extent_inline_len(leaf, slot, ei); + ext_len = PAGE_CACHE_ALIGN(ext_len); + } else { + ext_len = btrfs_file_extent_num_bytes(leaf, ei); + } + + if (key.offset + ext_len <= clone_root->offset) + goto next; + + if (key.offset > clone_root->offset) { + /* Implicit hole, NO_HOLES feature enabled. */ + u64 hole_len = key.offset - clone_root->offset; + + if (hole_len > len) + hole_len = len; + ret = send_extent_data(sctx, offset, hole_len); + if (ret < 0) + goto out; + + len -= hole_len; + if (len == 0) + break; + offset += hole_len; + clone_root->offset += hole_len; + data_offset += hole_len; + } + + if (key.offset >= clone_root->offset + len) + break; + + clone_len = min_t(u64, ext_len, len); + + if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte && + btrfs_file_extent_offset(leaf, ei) == data_offset) + ret = send_clone(sctx, offset, clone_len, clone_root); + else + ret = send_extent_data(sctx, offset, clone_len); + + if (ret < 0) + goto out; + + len -= clone_len; + if (len == 0) + break; + offset += clone_len; + clone_root->offset += clone_len; + data_offset += clone_len; +next: + path->slots[0]++; + } + + if (len > 0) + ret = send_extent_data(sctx, offset, len); + else + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key, @@ -4695,9 +4856,7 @@ static int send_write_or_clone(struct send_ctx *sctx, int ret = 0; struct btrfs_file_extent_item *ei; u64 offset = key->offset; - u64 pos = 0; u64 len; - u32 l; u8 type; u64 bs = sctx->send_root->fs_info->sb->s_blocksize; @@ -4725,22 +4884,15 @@ static int send_write_or_clone(struct send_ctx *sctx, } if (clone_root && IS_ALIGNED(offset + len, bs)) { - ret = send_clone(sctx, offset, len, clone_root); - } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { - ret = send_update_extent(sctx, offset, len); + u64 disk_byte; + u64 data_offset; + + disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); + data_offset = btrfs_file_extent_offset(path->nodes[0], ei); + ret = clone_range(sctx, clone_root, disk_byte, data_offset, + offset, len); } else { - while (pos < len) { - l = len - pos; - if (l > BTRFS_SEND_READ_SIZE) - l = BTRFS_SEND_READ_SIZE; - ret = send_write(sctx, pos + offset, l); - if (ret < 0) - goto out; - if (!ret) - break; - pos += ret; - } - ret = 0; + ret = send_extent_data(sctx, offset, len); } out: return ret; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 11d1eab..24154e4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -130,7 +130,6 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) } } -#ifdef CONFIG_PRINTK /* * __btrfs_std_error decodes expected errors from the caller and * invokes the approciate error response. @@ -140,7 +139,9 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { struct super_block *sb = fs_info->sb; +#ifdef CONFIG_PRINTK const char *errstr; +#endif /* * Special case: if the error is EROFS, and we're already @@ -149,6 +150,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) return; +#ifdef CONFIG_PRINTK errstr = btrfs_decode_error(errno); if (fmt) { struct va_format vaf; @@ -166,6 +168,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n", sb->s_id, function, line, errno, errstr); } +#endif /* Don't go through full error handling during mount */ save_error_info(fs_info); @@ -173,6 +176,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, btrfs_handle_error(fs_info); } +#ifdef CONFIG_PRINTK static const char * const logtypes[] = { "emergency", "alert", @@ -212,27 +216,6 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) va_end(args); } - -#else - -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) -{ - struct super_block *sb = fs_info->sb; - - /* - * Special case: if the error is EROFS, and we're already - * under MS_RDONLY, then it is safe here. - */ - if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) - return; - - /* Don't go through full error handling during mount */ - if (sb->s_flags & MS_BORN) { - save_error_info(fs_info); - btrfs_handle_error(fs_info); - } -} #endif /* @@ -320,6 +303,9 @@ enum { Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, Opt_datasum, Opt_treelog, Opt_noinode_cache, +#ifdef CONFIG_BTRFS_DEBUG + Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, +#endif Opt_err, }; @@ -372,6 +358,11 @@ static match_table_t tokens = { {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, {Opt_fatal_errors, "fatal_errors=%s"}, {Opt_commit_interval, "commit=%d"}, +#ifdef CONFIG_BTRFS_DEBUG + {Opt_fragment_data, "fragment=data"}, + {Opt_fragment_metadata, "fragment=metadata"}, + {Opt_fragment_all, "fragment=all"}, +#endif {Opt_err, NULL}, }; @@ -738,6 +729,22 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; } break; +#ifdef CONFIG_BTRFS_DEBUG + case Opt_fragment_all: + btrfs_info(root->fs_info, "fragmenting all space"); + btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); + btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA); + break; + case Opt_fragment_metadata: + btrfs_info(root->fs_info, "fragmenting metadata"); + btrfs_set_opt(info->mount_opt, + FRAGMENT_METADATA); + break; + case Opt_fragment_data: + btrfs_info(root->fs_info, "fragmenting data"); + btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); + break; +#endif case Opt_err: btrfs_info(root->fs_info, "unrecognized mount option '%s'", p); ret = -EINVAL; @@ -1189,6 +1196,12 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) seq_printf(seq, ",commit=%d", info->commit_interval); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_test_opt(root, FRAGMENT_DATA)) + seq_puts(seq, ",fragment=data"); + if (btrfs_test_opt(root, FRAGMENT_METADATA)) + seq_puts(seq, ",fragment=metadata"); +#endif seq_printf(seq, ",subvolid=%llu", BTRFS_I(d_inode(dentry))->root->root_key.objectid); seq_puts(seq, ",subvol="); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 603b0cc..e0ac859 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -437,24 +437,24 @@ static const struct attribute *btrfs_attrs[] = { NULL, }; -static void btrfs_release_super_kobj(struct kobject *kobj) +static void btrfs_release_fsid_kobj(struct kobject *kobj) { struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj); - memset(&fs_devs->super_kobj, 0, sizeof(struct kobject)); + memset(&fs_devs->fsid_kobj, 0, sizeof(struct kobject)); complete(&fs_devs->kobj_unregister); } static struct kobj_type btrfs_ktype = { .sysfs_ops = &kobj_sysfs_ops, - .release = btrfs_release_super_kobj, + .release = btrfs_release_fsid_kobj, }; static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj) { if (kobj->ktype != &btrfs_ktype) return NULL; - return container_of(kobj, struct btrfs_fs_devices, super_kobj); + return container_of(kobj, struct btrfs_fs_devices, fsid_kobj); } static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) @@ -502,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) attrs[0] = &fa->kobj_attr.attr; if (add) { int ret; - ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj, + ret = sysfs_merge_group(&fs_info->fs_devices->fsid_kobj, &agroup); if (ret) return ret; } else - sysfs_unmerge_group(&fs_info->fs_devices->super_kobj, + sysfs_unmerge_group(&fs_info->fs_devices->fsid_kobj, &agroup); } @@ -523,9 +523,9 @@ static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) fs_devs->device_dir_kobj = NULL; } - if (fs_devs->super_kobj.state_initialized) { - kobject_del(&fs_devs->super_kobj); - kobject_put(&fs_devs->super_kobj); + if (fs_devs->fsid_kobj.state_initialized) { + kobject_del(&fs_devs->fsid_kobj); + kobject_put(&fs_devs->fsid_kobj); wait_for_completion(&fs_devs->kobj_unregister); } } @@ -545,7 +545,7 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) } } -void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) { btrfs_reset_fs_info_ptr(fs_info); @@ -555,9 +555,9 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) kobject_put(fs_info->space_info_kobj); } addrm_unknown_feature_attrs(fs_info, false); - sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group); - sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs); - btrfs_kobj_rm_device(fs_info->fs_devices, NULL); + sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group); + sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs); + btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL); } const char * const btrfs_feature_set_names[3] = { @@ -637,7 +637,7 @@ static void init_feature_attrs(void) /* when one_device is NULL, it removes all device links */ -int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { struct hd_struct *disk; @@ -675,7 +675,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) { if (!fs_devs->device_dir_kobj) fs_devs->device_dir_kobj = kobject_create_and_add("devices", - &fs_devs->super_kobj); + &fs_devs->fsid_kobj); if (!fs_devs->device_dir_kobj) return -ENOMEM; @@ -683,7 +683,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) return 0; } -int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { int error = 0; @@ -730,31 +730,31 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, int error; init_completion(&fs_devs->kobj_unregister); - fs_devs->super_kobj.kset = btrfs_kset; - error = kobject_init_and_add(&fs_devs->super_kobj, + fs_devs->fsid_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, parent, "%pU", fs_devs->fsid); return error; } -int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) +int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) { int error; struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; - struct kobject *super_kobj = &fs_devs->super_kobj; + struct kobject *fsid_kobj = &fs_devs->fsid_kobj; btrfs_set_fs_info_ptr(fs_info); - error = btrfs_kobj_add_device(fs_devs, NULL); + error = btrfs_sysfs_add_device_link(fs_devs, NULL); if (error) return error; - error = sysfs_create_files(super_kobj, btrfs_attrs); + error = sysfs_create_files(fsid_kobj, btrfs_attrs); if (error) { - btrfs_kobj_rm_device(fs_devs, NULL); + btrfs_sysfs_rm_device_link(fs_devs, NULL); return error; } - error = sysfs_create_group(super_kobj, + error = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); if (error) goto failure; @@ -764,7 +764,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) goto failure; fs_info->space_info_kobj = kobject_create_and_add("allocation", - super_kobj); + fsid_kobj); if (!fs_info->space_info_kobj) { error = -ENOMEM; goto failure; @@ -776,7 +776,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) return 0; failure: - btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_mounted(fs_info); return error; } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 6392527..9c09522 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -82,9 +82,9 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; -int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); -int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, struct kobject *parent); diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 2299bfd..c8c3d70 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -19,6 +19,7 @@ #include <linux/slab.h> #include "btrfs-tests.h" #include "../ctree.h" +#include "../disk-io.h" #include "../free-space-cache.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) @@ -35,6 +36,12 @@ static struct btrfs_block_group_cache *init_test_block_group(void) kfree(cache); return NULL; } + cache->fs_info = btrfs_alloc_dummy_fs_info(); + if (!cache->fs_info) { + kfree(cache->free_space_ctl); + kfree(cache); + return NULL; + } cache->key.objectid = 0; cache->key.offset = 1024 * 1024 * 1024; @@ -879,7 +886,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) int btrfs_test_free_space_cache(void) { struct btrfs_block_group_cache *cache; - int ret; + struct btrfs_root *root = NULL; + int ret = -ENOMEM; test_msg("Running btrfs free space cache tests\n"); @@ -889,6 +897,17 @@ int btrfs_test_free_space_cache(void) return 0; } + root = btrfs_alloc_dummy_root(); + if (!root) + goto out; + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) + goto out; + + root->fs_info->extent_root = root; + cache->fs_info = root->fs_info; + ret = test_extents(cache); if (ret) goto out; @@ -904,6 +923,7 @@ out: __btrfs_remove_free_space_cache(cache->free_space_ctl); kfree(cache->free_space_ctl); kfree(cache); + btrfs_free_dummy_root(root); test_msg("Free space cache tests finished\n"); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a5b0644..418c6a2 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -82,6 +82,12 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) static void clear_btree_io_tree(struct extent_io_tree *tree) { spin_lock(&tree->lock); + /* + * Do a single barrier for the waitqueue_active check here, the state + * of the waitqueue should not change once clear_btree_io_tree is + * called. + */ + smp_mb(); while (!RB_EMPTY_ROOT(&tree->state)) { struct rb_node *node; struct extent_state *state; @@ -226,25 +232,22 @@ loop: extwriter_counter_init(cur_trans, type); init_waitqueue_head(&cur_trans->writer_wait); init_waitqueue_head(&cur_trans->commit_wait); + init_waitqueue_head(&cur_trans->pending_wait); cur_trans->state = TRANS_STATE_RUNNING; /* * One for this trans handle, one so it will live on until we * commit the transaction. */ atomic_set(&cur_trans->use_count, 2); - cur_trans->have_free_bgs = 0; + atomic_set(&cur_trans->pending_ordered, 0); + cur_trans->flags = 0; cur_trans->start_time = get_seconds(); - cur_trans->dirty_bg_run = 0; + + memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); cur_trans->delayed_refs.href_root = RB_ROOT; cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; atomic_set(&cur_trans->delayed_refs.num_entries, 0); - cur_trans->delayed_refs.num_heads_ready = 0; - cur_trans->delayed_refs.pending_csums = 0; - cur_trans->delayed_refs.num_heads = 0; - cur_trans->delayed_refs.flushing = 0; - cur_trans->delayed_refs.run_delayed_start = 0; - cur_trans->delayed_refs.qgroup_to_skip = 0; /* * although the tree mod log is per file system and not per transaction, @@ -264,7 +267,6 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->pending_chunks); INIT_LIST_HEAD(&cur_trans->switch_commits); - INIT_LIST_HEAD(&cur_trans->pending_ordered); INIT_LIST_HEAD(&cur_trans->dirty_bgs); INIT_LIST_HEAD(&cur_trans->io_bgs); INIT_LIST_HEAD(&cur_trans->dropped_roots); @@ -447,8 +449,8 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root) } static struct btrfs_trans_handle * -start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, - enum btrfs_reserve_flush_enum flush) +start_transaction(struct btrfs_root *root, unsigned int num_items, + unsigned int type, enum btrfs_reserve_flush_enum flush) { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; @@ -478,13 +480,10 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, * the appropriate flushing if need be. */ if (num_items > 0 && root != root->fs_info->chunk_root) { - if (root->fs_info->quota_enabled && - is_fstree(root->root_key.objectid)) { - qgroup_reserved = num_items * root->nodesize; - ret = btrfs_qgroup_reserve(root, qgroup_reserved); - if (ret) - return ERR_PTR(ret); - } + qgroup_reserved = num_items * root->nodesize; + ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved); + if (ret) + return ERR_PTR(ret); num_bytes = btrfs_calc_trans_metadata_size(root, num_items); /* @@ -502,7 +501,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, goto reserve_fail; } again: - h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) { ret = -ENOMEM; goto alloc_fail; @@ -543,26 +542,13 @@ again: h->transid = cur_trans->transid; h->transaction = cur_trans; - h->blocks_used = 0; - h->bytes_reserved = 0; - h->chunk_bytes_reserved = 0; h->root = root; - h->delayed_ref_updates = 0; h->use_count = 1; - h->adding_csums = 0; - h->block_rsv = NULL; - h->orig_rsv = NULL; - h->aborted = 0; - h->qgroup_reserved = 0; - h->delayed_ref_elem.seq = 0; + h->type = type; - h->allocating_chunk = false; h->can_flush_pending_bgs = true; - h->reloc_reserved = false; - h->sync = false; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); - INIT_LIST_HEAD(&h->ordered); smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED && @@ -579,7 +565,6 @@ again: h->bytes_reserved = num_bytes; h->reloc_reserved = reloc_reserved; } - h->qgroup_reserved = qgroup_reserved; got_it: btrfs_record_root_in_trans(h, root); @@ -597,20 +582,20 @@ alloc_fail: btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, num_bytes); reserve_fail: - if (qgroup_reserved) - btrfs_qgroup_free(root, qgroup_reserved); + btrfs_qgroup_free_meta(root, qgroup_reserved); return ERR_PTR(ret); } struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - int num_items) + unsigned int num_items) { return start_transaction(root, num_items, TRANS_START, BTRFS_RESERVE_FLUSH_ALL); } struct btrfs_trans_handle *btrfs_start_transaction_lflush( - struct btrfs_root *root, int num_items) + struct btrfs_root *root, + unsigned int num_items) { return start_transaction(root, num_items, TRANS_START, BTRFS_RESERVE_FLUSH_LIMIT); @@ -794,12 +779,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); - if (!list_empty(&trans->ordered)) { - spin_lock(&info->trans_lock); - list_splice_init(&trans->ordered, &cur_trans->pending_ordered); - spin_unlock(&info->trans_lock); - } - trans->delayed_ref_updates = 0; if (!trans->sync) { must_run_delayed_refs = @@ -815,15 +794,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, must_run_delayed_refs = 2; } - if (trans->qgroup_reserved) { - /* - * the same root has to be passed here between start_transaction - * and end_transaction. Subvolume quota depends on this. - */ - btrfs_qgroup_free(trans->root, trans->qgroup_reserved); - trans->qgroup_reserved = 0; - } - btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; @@ -856,6 +826,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, atomic_dec(&cur_trans->num_writers); extwriter_counter_dec(cur_trans, trans->type); + /* + * Make sure counter is updated before we wake up waiters. + */ smp_mb(); if (waitqueue_active(&cur_trans->writer_wait)) wake_up(&cur_trans->writer_wait); @@ -1238,6 +1211,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, spin_lock(&fs_info->fs_roots_radix_lock); if (err) break; + btrfs_qgroup_free_meta_all(root); } } spin_unlock(&fs_info->fs_roots_radix_lock); @@ -1795,25 +1769,10 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) } static inline void -btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans, - struct btrfs_fs_info *fs_info) +btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans) { - struct btrfs_ordered_extent *ordered; - - spin_lock(&fs_info->trans_lock); - while (!list_empty(&cur_trans->pending_ordered)) { - ordered = list_first_entry(&cur_trans->pending_ordered, - struct btrfs_ordered_extent, - trans_list); - list_del_init(&ordered->trans_list); - spin_unlock(&fs_info->trans_lock); - - wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE, - &ordered->flags)); - btrfs_put_ordered_extent(ordered); - spin_lock(&fs_info->trans_lock); - } - spin_unlock(&fs_info->trans_lock); + wait_event(cur_trans->pending_wait, + atomic_read(&cur_trans->pending_ordered) == 0); } int btrfs_commit_transaction(struct btrfs_trans_handle *trans, @@ -1842,10 +1801,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; - if (trans->qgroup_reserved) { - btrfs_qgroup_free(root, trans->qgroup_reserved); - trans->qgroup_reserved = 0; - } cur_trans = trans->transaction; @@ -1865,7 +1820,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; } - if (!cur_trans->dirty_bg_run) { + if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) { int run_it = 0; /* this mutex is also taken before trying to set @@ -1874,18 +1829,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * after a extents from that block group have been * allocated for cache files. btrfs_set_block_group_ro * will wait for the transaction to commit if it - * finds dirty_bg_run = 1 + * finds BTRFS_TRANS_DIRTY_BG_RUN set. * - * The dirty_bg_run flag is also used to make sure only - * one process starts all the block group IO. It wouldn't + * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure + * only one process starts all the block group IO. It wouldn't * hurt to have more than one go through, but there's no * real advantage to it either. */ mutex_lock(&root->fs_info->ro_block_group_mutex); - if (!cur_trans->dirty_bg_run) { + if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN, + &cur_trans->flags)) run_it = 1; - cur_trans->dirty_bg_run = 1; - } mutex_unlock(&root->fs_info->ro_block_group_mutex); if (run_it) @@ -1897,7 +1851,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } spin_lock(&root->fs_info->trans_lock); - list_splice_init(&trans->ordered, &cur_trans->pending_ordered); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { spin_unlock(&root->fs_info->trans_lock); atomic_inc(&cur_trans->use_count); @@ -1956,7 +1909,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_wait_delalloc_flush(root->fs_info); - btrfs_wait_pending_ordered(cur_trans, root->fs_info); + btrfs_wait_pending_ordered(cur_trans); btrfs_scrub_pause(root); /* @@ -2136,7 +2089,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_transaction(trans, root); if (ret) { - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "Error while writing out transaction"); mutex_unlock(&root->fs_info->tree_log_mutex); goto scrub_continue; @@ -2156,7 +2109,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root); - if (cur_trans->have_free_bgs) + if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) btrfs_clear_space_info_full(root->fs_info); root->fs_info->last_trans_committed = cur_trans->transid; @@ -2198,10 +2151,6 @@ cleanup_transaction: btrfs_trans_release_metadata(trans, root); btrfs_trans_release_chunk_metadata(trans); trans->block_rsv = NULL; - if (trans->qgroup_reserved) { - btrfs_qgroup_free(root, trans->qgroup_reserved); - trans->qgroup_reserved = 0; - } btrfs_warn(root->fs_info, "Skipping commit of aborted transaction."); if (current->journal_info == trans) current->journal_info = NULL; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index a994bb0..b05b2f6 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -32,6 +32,10 @@ enum btrfs_trans_state { TRANS_STATE_MAX = 6, }; +#define BTRFS_TRANS_HAVE_FREE_BGS 0 +#define BTRFS_TRANS_DIRTY_BG_RUN 1 +#define BTRFS_TRANS_CACHE_ENOSPC 2 + struct btrfs_transaction { u64 transid; /* @@ -46,11 +50,9 @@ struct btrfs_transaction { */ atomic_t num_writers; atomic_t use_count; + atomic_t pending_ordered; - /* - * true if there is free bgs operations in this transaction - */ - int have_free_bgs; + unsigned long flags; /* Be protected by fs_info->trans_lock when we want to change it. */ enum btrfs_trans_state state; @@ -59,9 +61,9 @@ struct btrfs_transaction { unsigned long start_time; wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; + wait_queue_head_t pending_wait; struct list_head pending_snapshots; struct list_head pending_chunks; - struct list_head pending_ordered; struct list_head switch_commits; struct list_head dirty_bgs; struct list_head io_bgs; @@ -80,7 +82,6 @@ struct btrfs_transaction { spinlock_t dropped_roots_lock; struct btrfs_delayed_ref_root delayed_refs; int aborted; - int dirty_bg_run; }; #define __TRANS_FREEZABLE (1U << 0) @@ -107,7 +108,6 @@ struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; u64 chunk_bytes_reserved; - u64 qgroup_reserved; unsigned long use_count; unsigned long blocks_reserved; unsigned long blocks_used; @@ -129,7 +129,6 @@ struct btrfs_trans_handle { */ struct btrfs_root *root; struct seq_list delayed_ref_elem; - struct list_head ordered; struct list_head qgroup_ref_list; struct list_head new_bgs; }; @@ -185,9 +184,10 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - int num_items); + unsigned int num_items); struct btrfs_trans_handle *btrfs_start_transaction_lflush( - struct btrfs_root *root, int num_items); + struct btrfs_root *root, + unsigned int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1bbaace..323e12c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -229,7 +229,9 @@ int btrfs_pin_log_trans(struct btrfs_root *root) void btrfs_end_log_trans(struct btrfs_root *root) { if (atomic_dec_and_test(&root->log_writers)) { - smp_mb(); + /* + * Implicit memory barrier after atomic_dec_and_test + */ if (waitqueue_active(&root->log_writer_wait)) wake_up(&root->log_writer_wait); } @@ -691,7 +693,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, 0, root->root_key.objectid, - key->objectid, offset, 0); + key->objectid, offset); if (ret) goto out; } else { @@ -2820,7 +2822,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_lock(&log_root_tree->log_mutex); if (atomic_dec_and_test(&log_root_tree->log_writers)) { - smp_mb(); + /* + * Implicit memory barrier after atomic_dec_and_test + */ if (waitqueue_active(&log_root_tree->log_writer_wait)) wake_up(&log_root_tree->log_writer_wait); } @@ -2950,6 +2954,9 @@ out_wake_log_root: atomic_set(&log_root_tree->log_commit[index2], 0); mutex_unlock(&log_root_tree->log_mutex); + /* + * The barrier before waitqueue_active is implied by mutex_unlock + */ if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) wake_up(&log_root_tree->log_commit_wait[index2]); out: @@ -2961,6 +2968,9 @@ out: atomic_set(&root->log_commit[index1], 0); mutex_unlock(&root->log_mutex); + /* + * The barrier before waitqueue_active is implied by mutex_unlock + */ if (waitqueue_active(&root->log_commit_wait[index1])) wake_up(&root->log_commit_wait[index1]); return ret; @@ -5314,7 +5324,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = walk_log_tree(trans, log_root_tree, &wc); if (ret) { - btrfs_error(fs_info, ret, "Failed to pin buffers while " + btrfs_std_error(fs_info, ret, "Failed to pin buffers while " "recovering log root tree."); goto error; } @@ -5328,7 +5338,7 @@ again: ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); if (ret < 0) { - btrfs_error(fs_info, ret, + btrfs_std_error(fs_info, ret, "Couldn't find tree log root."); goto error; } @@ -5346,7 +5356,7 @@ again: log = btrfs_read_fs_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); - btrfs_error(fs_info, ret, + btrfs_std_error(fs_info, ret, "Couldn't read tree log root."); goto error; } @@ -5361,7 +5371,7 @@ again: free_extent_buffer(log->node); free_extent_buffer(log->commit_root); kfree(log); - btrfs_error(fs_info, ret, "Couldn't read target root " + btrfs_std_error(fs_info, ret, "Couldn't read target root " "for tree log recovery."); goto error; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6fc73586..17ed76d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -42,6 +42,82 @@ #include "dev-replace.h" #include "sysfs.h" +const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = { + .sub_stripes = 2, + .dev_stripes = 1, + .devs_max = 0, /* 0 == as many as possible */ + .devs_min = 4, + .tolerated_failures = 1, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_RAID1] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 2, + .devs_min = 2, + .tolerated_failures = 1, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_DUP] = { + .sub_stripes = 1, + .dev_stripes = 2, + .devs_max = 1, + .devs_min = 1, + .tolerated_failures = 0, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID0] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .tolerated_failures = 0, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_SINGLE] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 1, + .devs_min = 1, + .tolerated_failures = 0, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_RAID5] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .tolerated_failures = 1, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID6] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 3, + .tolerated_failures = 2, + .devs_increment = 1, + .ncopies = 3, + }, +}; + +const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10, + [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1, + [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP, + [BTRFS_RAID_RAID0] = BTRFS_BLOCK_GROUP_RAID0, + [BTRFS_RAID_SINGLE] = 0, + [BTRFS_RAID_RAID5] = BTRFS_BLOCK_GROUP_RAID5, + [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6, +}; + static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); @@ -198,7 +274,6 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, if (IS_ERR(*bdev)) { ret = PTR_ERR(*bdev); - printk(KERN_INFO "BTRFS: open %s failed\n", device_path); goto error; } @@ -211,8 +286,8 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, } invalidate_bdev(*bdev); *bh = btrfs_read_dev_super(*bdev); - if (!*bh) { - ret = -EINVAL; + if (IS_ERR(*bh)) { + ret = PTR_ERR(*bh); blkdev_put(*bdev, flags); goto error; } @@ -345,6 +420,9 @@ loop_lock: pending = pending->bi_next; cur->bi_next = NULL; + /* + * atomic_dec_return implies a barrier for waitqueue_active + */ if (atomic_dec_return(&fs_info->nr_async_bios) < limit && waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); @@ -765,36 +843,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { - struct btrfs_device *new_device; - struct rcu_string *name; - - if (device->bdev) - fs_devices->open_devices--; - - if (device->writeable && - device->devid != BTRFS_DEV_REPLACE_DEVID) { - list_del_init(&device->dev_alloc_list); - fs_devices->rw_devices--; - } - - if (device->missing) - fs_devices->missing_devices--; - - new_device = btrfs_alloc_device(NULL, &device->devid, - device->uuid); - BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ - - /* Safe because we are under uuid_mutex */ - if (device->name) { - name = rcu_string_strdup(device->name->str, GFP_NOFS); - BUG_ON(!name); /* -ENOMEM */ - rcu_assign_pointer(new_device->name, name); - } - - list_replace_rcu(&device->dev_list, &new_device->dev_list); - new_device->fs_devices = device->fs_devices; - - call_rcu(&device->rcu, free_device); + btrfs_close_one_device(device); } mutex_unlock(&fs_devices->device_list_mutex); @@ -1402,7 +1451,7 @@ again: extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); } else { - btrfs_error(root->fs_info, ret, "Slot search failed"); + btrfs_std_error(root->fs_info, ret, "Slot search failed"); goto out; } @@ -1410,10 +1459,10 @@ again: ret = btrfs_del_item(trans, root, path); if (ret) { - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "Failed to remove dev extent item"); } else { - trans->transaction->have_free_bgs = 1; + set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); } out: btrfs_free_path(path); @@ -1801,7 +1850,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->bdev) { device->fs_devices->open_devices--; /* remove sysfs entry */ - btrfs_kobj_rm_device(root->fs_info->fs_devices, device); + btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device); } call_rcu(&device->rcu, free_device); @@ -1924,7 +1973,8 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, if (srcdev->writeable) { fs_devices->rw_devices--; /* zero out the old super if it is writable */ - btrfs_scratch_superblock(srcdev); + btrfs_scratch_superblocks(srcdev->bdev, + rcu_str_deref(srcdev->name)); } if (srcdev->bdev) @@ -1971,10 +2021,11 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, WARN_ON(!tgtdev); mutex_lock(&fs_info->fs_devices->device_list_mutex); - btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev); + btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); if (tgtdev->bdev) { - btrfs_scratch_superblock(tgtdev); + btrfs_scratch_superblocks(tgtdev->bdev, + rcu_str_deref(tgtdev->name)); fs_info->fs_devices->open_devices--; } fs_info->fs_devices->num_devices--; @@ -2041,10 +2092,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, } } - if (!*device) { - btrfs_err(root->fs_info, "no missing device found"); - return -ENOENT; - } + if (!*device) + return BTRFS_ERROR_DEV_MISSING_NOT_FOUND; return 0; } else { @@ -2309,7 +2358,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) tmp + 1); /* add sysfs device entry */ - btrfs_kobj_add_device(root->fs_info->fs_devices, device); + btrfs_sysfs_add_device_link(root->fs_info->fs_devices, device); /* * we've got more storage, clear any full flags on the space @@ -2350,9 +2399,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) */ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", root->fs_info->fsid); - if (kobject_rename(&root->fs_info->fs_devices->super_kobj, + if (kobject_rename(&root->fs_info->fs_devices->fsid_kobj, fsid_buf)) - pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n"); + btrfs_warn(root->fs_info, + "sysfs: failed to create fsid for sprout"); } root->fs_info->num_tolerated_disk_barrier_failures = @@ -2368,7 +2418,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_relocate_sys_chunks(root); if (ret < 0) - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "Failed to relocate sys chunks after " "device initialization. This can be fixed " "using the \"btrfs balance\" command."); @@ -2388,7 +2438,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) error_trans: btrfs_end_transaction(trans, root); rcu_string_free(device->name); - btrfs_kobj_rm_device(root->fs_info->fs_devices, device); + btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -2613,7 +2663,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, if (ret < 0) goto out; else if (ret > 0) { /* Logic error or corruption */ - btrfs_error(root->fs_info, -ENOENT, + btrfs_std_error(root->fs_info, -ENOENT, "Failed lookup while freeing chunk."); ret = -ENOENT; goto out; @@ -2621,7 +2671,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); if (ret < 0) - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "Failed to delete chunk item."); out: btrfs_free_path(path); @@ -2806,7 +2856,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset) trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); return ret; } @@ -3009,16 +3059,19 @@ static void update_balance_args(struct btrfs_balance_control *bctl) * (albeit full) chunks. */ if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->data.usage = 90; } if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->sys.usage = 90; } if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->meta.usage = 90; @@ -3074,13 +3127,46 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, struct btrfs_balance_args *bargs) { struct btrfs_block_group_cache *cache; + u64 chunk_used; + u64 user_thresh_min; + u64 user_thresh_max; + int ret = 1; + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + chunk_used = btrfs_block_group_used(&cache->item); + + if (bargs->usage_min == 0) + user_thresh_min = 0; + else + user_thresh_min = div_factor_fine(cache->key.offset, + bargs->usage_min); + + if (bargs->usage_max == 0) + user_thresh_max = 1; + else if (bargs->usage_max > 100) + user_thresh_max = cache->key.offset; + else + user_thresh_max = div_factor_fine(cache->key.offset, + bargs->usage_max); + + if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) + ret = 0; + + btrfs_put_block_group(cache); + return ret; +} + +static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, + u64 chunk_offset, struct btrfs_balance_args *bargs) +{ + struct btrfs_block_group_cache *cache; u64 chunk_used, user_thresh; int ret = 1; cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = btrfs_block_group_used(&cache->item); - if (bargs->usage == 0) + if (bargs->usage_min == 0) user_thresh = 1; else if (bargs->usage > 100) user_thresh = cache->key.offset; @@ -3170,6 +3256,19 @@ static int chunk_vrange_filter(struct extent_buffer *leaf, return 1; } +static int chunk_stripes_range_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) +{ + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + + if (bargs->stripes_min <= num_stripes + && num_stripes <= bargs->stripes_max) + return 0; + + return 1; +} + static int chunk_soft_convert_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { @@ -3216,6 +3315,9 @@ static int should_balance_chunk(struct btrfs_root *root, if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { return 0; + } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && + chunk_usage_range_filter(bctl->fs_info, chunk_offset, bargs)) { + return 0; } /* devid filter */ @@ -3236,6 +3338,12 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; } + /* stripes filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && + chunk_stripes_range_filter(leaf, chunk, bargs)) { + return 0; + } + /* soft profile changing mode */ if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && chunk_soft_convert_filter(chunk_type, bargs)) { @@ -3250,6 +3358,16 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; else bargs->limit--; + } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { + /* + * Same logic as the 'limit' filter; the minimum cannot be + * determined here because we do not have the global informatoin + * about the count of all chunks that satisfy the filters. + */ + if (bargs->limit_max == 0) + return 0; + else + bargs->limit_max--; } return 1; @@ -3264,6 +3382,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) struct btrfs_device *device; u64 old_size; u64 size_to_free; + u64 chunk_type; struct btrfs_chunk *chunk; struct btrfs_path *path; struct btrfs_key key; @@ -3274,9 +3393,13 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) int ret; int enospc_errors = 0; bool counting = true; + /* The single value limit and min/max limits use the same bytes in the */ u64 limit_data = bctl->data.limit; u64 limit_meta = bctl->meta.limit; u64 limit_sys = bctl->sys.limit; + u32 count_data = 0; + u32 count_meta = 0; + u32 count_sys = 0; /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; @@ -3317,6 +3440,10 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->balance_lock); again: if (!counting) { + /* + * The single value limit and min/max limits use the same bytes + * in the + */ bctl->data.limit = limit_data; bctl->meta.limit = limit_meta; bctl->sys.limit = limit_sys; @@ -3364,6 +3491,7 @@ again: } chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + chunk_type = btrfs_chunk_type(leaf, chunk); if (!counting) { spin_lock(&fs_info->balance_lock); @@ -3384,6 +3512,28 @@ again: spin_lock(&fs_info->balance_lock); bctl->stat.expected++; spin_unlock(&fs_info->balance_lock); + + if (chunk_type & BTRFS_BLOCK_GROUP_DATA) + count_data++; + else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) + count_sys++; + else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) + count_meta++; + + goto loop; + } + + /* + * Apply limit_min filter, no need to check if the LIMITS + * filter is used, limit_min is 0 by default + */ + if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && + count_data < bctl->data.limit_min) + || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && + count_meta < bctl->meta.limit_min) + || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && + count_sys < bctl->sys.limit_min)) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); goto loop; } @@ -3461,11 +3611,20 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) unset_balance_control(fs_info); ret = del_balance_item(fs_info->tree_root); if (ret) - btrfs_std_error(fs_info, ret); + btrfs_std_error(fs_info, ret, NULL); atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } +/* Non-zero return value signifies invalidity */ +static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, + u64 allowed) +{ + return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) && + (!alloc_profile_is_valid(bctl_arg->target, 1) || + (bctl_arg->target & ~allowed))); +} + /* * Should be called with both balance and volume mutexes held */ @@ -3523,27 +3682,21 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if (num_devices > 3) allowed |= (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID6); - if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl->data.target, 1) || - (bctl->data.target & ~allowed))) { + if (validate_convert_profile(&bctl->data, allowed)) { btrfs_err(fs_info, "unable to start balance with target " "data profile %llu", bctl->data.target); ret = -EINVAL; goto out; } - if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl->meta.target, 1) || - (bctl->meta.target & ~allowed))) { + if (validate_convert_profile(&bctl->meta, allowed)) { btrfs_err(fs_info, "unable to start balance with target metadata profile %llu", bctl->meta.target); ret = -EINVAL; goto out; } - if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl->sys.target, 1) || - (bctl->sys.target & ~allowed))) { + if (validate_convert_profile(&bctl->sys, allowed)) { btrfs_err(fs_info, "unable to start balance with target system profile %llu", bctl->sys.target); @@ -4285,65 +4438,6 @@ static int btrfs_cmp_device_info(const void *a, const void *b) return 0; } -static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { - [BTRFS_RAID_RAID10] = { - .sub_stripes = 2, - .dev_stripes = 1, - .devs_max = 0, /* 0 == as many as possible */ - .devs_min = 4, - .devs_increment = 2, - .ncopies = 2, - }, - [BTRFS_RAID_RAID1] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 2, - .devs_min = 2, - .devs_increment = 2, - .ncopies = 2, - }, - [BTRFS_RAID_DUP] = { - .sub_stripes = 1, - .dev_stripes = 2, - .devs_max = 1, - .devs_min = 1, - .devs_increment = 1, - .ncopies = 2, - }, - [BTRFS_RAID_RAID0] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 0, - .devs_min = 2, - .devs_increment = 1, - .ncopies = 1, - }, - [BTRFS_RAID_SINGLE] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 1, - .devs_min = 1, - .devs_increment = 1, - .ncopies = 1, - }, - [BTRFS_RAID_RAID5] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 0, - .devs_min = 2, - .devs_increment = 1, - .ncopies = 2, - }, - [BTRFS_RAID_RAID6] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 0, - .devs_min = 3, - .devs_increment = 1, - .ncopies = 3, - }, -}; - static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) { /* TODO allow them to set a preferred stripe size */ @@ -6594,8 +6688,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, BUG_ON(!path); ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { - printk_in_rcu(KERN_WARNING "BTRFS: " - "error %d while searching for dev_stats item for device %s!\n", + btrfs_warn_in_rcu(dev_root->fs_info, + "error %d while searching for dev_stats item for device %s", ret, rcu_str_deref(device->name)); goto out; } @@ -6605,8 +6699,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, /* need to delete old one and insert a new one */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { - printk_in_rcu(KERN_WARNING "BTRFS: " - "delete too small dev_stats item for device %s failed %d!\n", + btrfs_warn_in_rcu(dev_root->fs_info, + "delete too small dev_stats item for device %s failed %d", rcu_str_deref(device->name), ret); goto out; } @@ -6619,9 +6713,9 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { - printk_in_rcu(KERN_WARNING "BTRFS: " - "insert dev_stats item for device %s failed %d!\n", - rcu_str_deref(device->name), ret); + btrfs_warn_in_rcu(dev_root->fs_info, + "insert dev_stats item for device %s failed %d", + rcu_str_deref(device->name), ret); goto out; } } @@ -6675,8 +6769,8 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) { if (!dev->dev_stats_valid) return; - printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " - "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + btrfs_err_rl_in_rcu(dev->dev_root->fs_info, + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), @@ -6695,8 +6789,8 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) if (i == BTRFS_DEV_STAT_VALUES_MAX) return; /* all values == 0, suppress message */ - printk_in_rcu(KERN_INFO "BTRFS: " - "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + btrfs_info_in_rcu(dev->dev_root->fs_info, + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), @@ -6740,22 +6834,34 @@ int btrfs_get_dev_stats(struct btrfs_root *root, return 0; } -int btrfs_scratch_superblock(struct btrfs_device *device) +void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path) { struct buffer_head *bh; struct btrfs_super_block *disk_super; + int copy_num; - bh = btrfs_read_dev_super(device->bdev); - if (!bh) - return -EINVAL; - disk_super = (struct btrfs_super_block *)bh->b_data; + if (!bdev) + return; - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); - brelse(bh); + for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; + copy_num++) { - return 0; + if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) + continue; + + disk_super = (struct btrfs_super_block *)bh->b_data; + + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + } + + /* Notify udev that device has changed */ + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + + /* Update ctime/mtime for device path for libblkid */ + update_dev_time(device_path); } /* @@ -6823,3 +6929,38 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) fs_devices = fs_devices->seed; } } + +void btrfs_close_one_device(struct btrfs_device *device) +{ + struct btrfs_fs_devices *fs_devices = device->fs_devices; + struct btrfs_device *new_device; + struct rcu_string *name; + + if (device->bdev) + fs_devices->open_devices--; + + if (device->writeable && + device->devid != BTRFS_DEV_REPLACE_DEVID) { + list_del_init(&device->dev_alloc_list); + fs_devices->rw_devices--; + } + + if (device->missing) + fs_devices->missing_devices--; + + new_device = btrfs_alloc_device(NULL, &device->devid, + device->uuid); + BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ + + /* Safe because we are under uuid_mutex */ + if (device->name) { + name = rcu_string_strdup(device->name->str, GFP_NOFS); + BUG_ON(!name); /* -ENOMEM */ + rcu_assign_pointer(new_device->name, name); + } + + list_replace_rcu(&device->dev_list, &new_device->dev_list); + new_device->fs_devices = device->fs_devices; + + call_rcu(&device->rcu, free_device); +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 595279a..ec57123 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -256,7 +256,7 @@ struct btrfs_fs_devices { struct btrfs_fs_info *fs_info; /* sysfs kobjects */ - struct kobject super_kobj; + struct kobject fsid_kobj; struct kobject *device_dir_kobj; struct completion kobj_unregister; }; @@ -334,10 +334,15 @@ struct btrfs_raid_attr { int dev_stripes; /* stripes per dev */ int devs_max; /* max devs to use */ int devs_min; /* min devs needed */ + int tolerated_failures; /* max tolerated fail devs */ int devs_increment; /* ndevs has to be a multiple of this */ int ncopies; /* how many copies to data has */ }; +extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES]; + +extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES]; + struct map_lookup { u64 type; int io_align; @@ -375,6 +380,9 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) #define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) #define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) +#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6) +#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7) +#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 8) #define BTRFS_BALANCE_ARGS_MASK \ (BTRFS_BALANCE_ARGS_PROFILES | \ @@ -382,7 +390,10 @@ struct map_lookup { BTRFS_BALANCE_ARGS_DEVID | \ BTRFS_BALANCE_ARGS_DRANGE | \ BTRFS_BALANCE_ARGS_VRANGE | \ - BTRFS_BALANCE_ARGS_LIMIT) + BTRFS_BALANCE_ARGS_LIMIT | \ + BTRFS_BALANCE_ARGS_LIMIT_RANGE | \ + BTRFS_BALANCE_ARGS_STRIPES_RANGE | \ + BTRFS_BALANCE_ARGS_USAGE_RANGE) /* * Profile changing flags. When SOFT is set we won't relocate chunk if @@ -482,7 +493,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); -int btrfs_scratch_superblock(struct btrfs_device *device); +void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path); int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len, int mirror_num); unsigned long btrfs_full_stripe_len(struct btrfs_root *root, @@ -555,5 +566,6 @@ static inline void unlock_chunks(struct btrfs_root *root) struct list_head *btrfs_get_fs_uuids(void); void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); +void btrfs_close_one_device(struct btrfs_device *device); #endif diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 0b73af9..b4473da 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1117,6 +1117,119 @@ DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy, TP_ARGS(wq) ); +DECLARE_EVENT_CLASS(btrfs__qgroup_data_map, + + TP_PROTO(struct inode *inode, u64 free_reserved), + + TP_ARGS(inode, free_reserved), + + TP_STRUCT__entry( + __field( u64, rootid ) + __field( unsigned long, ino ) + __field( u64, free_reserved ) + ), + + TP_fast_assign( + __entry->rootid = BTRFS_I(inode)->root->objectid; + __entry->ino = inode->i_ino; + __entry->free_reserved = free_reserved; + ), + + TP_printk("rootid=%llu, ino=%lu, free_reserved=%llu", + __entry->rootid, __entry->ino, __entry->free_reserved) +); + +DEFINE_EVENT(btrfs__qgroup_data_map, btrfs_qgroup_init_data_rsv_map, + + TP_PROTO(struct inode *inode, u64 free_reserved), + + TP_ARGS(inode, free_reserved) +); + +DEFINE_EVENT(btrfs__qgroup_data_map, btrfs_qgroup_free_data_rsv_map, + + TP_PROTO(struct inode *inode, u64 free_reserved), + + TP_ARGS(inode, free_reserved) +); + +#define BTRFS_QGROUP_OPERATIONS \ + { QGROUP_RESERVE, "reserve" }, \ + { QGROUP_RELEASE, "release" }, \ + { QGROUP_FREE, "free" } + +DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data, + + TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op), + + TP_ARGS(inode, start, len, reserved, op), + + TP_STRUCT__entry( + __field( u64, rootid ) + __field( unsigned long, ino ) + __field( u64, start ) + __field( u64, len ) + __field( u64, reserved ) + __field( int, op ) + ), + + TP_fast_assign( + __entry->rootid = BTRFS_I(inode)->root->objectid; + __entry->ino = inode->i_ino; + __entry->start = start; + __entry->len = len; + __entry->reserved = reserved; + __entry->op = op; + ), + + TP_printk("root=%llu, ino=%lu, start=%llu, len=%llu, reserved=%llu, op=%s", + __entry->rootid, __entry->ino, __entry->start, __entry->len, + __entry->reserved, + __print_flags((unsigned long)__entry->op, "", + BTRFS_QGROUP_OPERATIONS) + ) +); + +DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_reserve_data, + + TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op), + + TP_ARGS(inode, start, len, reserved, op) +); + +DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data, + + TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op), + + TP_ARGS(inode, start, len, reserved, op) +); + +DECLARE_EVENT_CLASS(btrfs__qgroup_delayed_ref, + + TP_PROTO(u64 ref_root, u64 reserved), + + TP_ARGS(ref_root, reserved), + + TP_STRUCT__entry( + __field( u64, ref_root ) + __field( u64, reserved ) + ), + + TP_fast_assign( + __entry->ref_root = ref_root; + __entry->reserved = reserved; + ), + + TP_printk("root=%llu, reserved=%llu, op=free", + __entry->ref_root, __entry->reserved) +); + +DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref, + + TP_PROTO(u64 ref_root, u64 reserved), + + TP_ARGS(ref_root, reserved) +); #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index b6dec05..dea8931 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -206,7 +206,13 @@ struct btrfs_ioctl_feature_flags { */ struct btrfs_balance_args { __u64 profiles; - __u64 usage; + union { + __le64 usage; + struct { + __le32 usage_min; + __le32 usage_max; + }; + }; __u64 devid; __u64 pstart; __u64 pend; @@ -217,8 +223,27 @@ struct btrfs_balance_args { __u64 flags; - __u64 limit; /* limit number of processed chunks */ - __u64 unused[7]; + /* + * BTRFS_BALANCE_ARGS_LIMIT with value 'limit' + * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum + * and maximum + */ + union { + __u64 limit; /* limit number of processed chunks */ + struct { + __u32 limit_min; + __u32 limit_max; + }; + }; + + /* + * Process chunks that cross stripes_min..stripes_max devices, + * BTRFS_BALANCE_ARGS_STRIPES_RANGE + */ + __le32 stripes_min; + __le32 stripes_max; + + __u64 unused[6]; } __attribute__ ((__packed__)); /* report balance progress to userspace */ |