diff options
Diffstat (limited to 'fs/btrfs')
35 files changed, 3995 insertions, 2308 deletions
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index f128427..3616042 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -27,7 +27,7 @@ #include "btrfs_inode.h" #include "xattr.h" -#ifdef CONFIG_FS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) { @@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = { .set = btrfs_xattr_acl_access_set, }; -#else /* CONFIG_FS_POSIX_ACL */ +#else /* CONFIG_BTRFS_FS_POSIX_ACL */ int btrfs_acl_chmod(struct inode *inode) { @@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) return 0; } -#endif /* CONFIG_FS_POSIX_ACL */ +#endif /* CONFIG_BTRFS_FS_POSIX_ACL */ diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 019e8af..c0861e7 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -48,6 +48,9 @@ struct btrfs_worker_thread { /* number of things on the pending list */ atomic_t num_pending; + /* reference counter for this struct */ + atomic_t refs; + unsigned long sequence; /* protects the pending list. */ @@ -61,6 +64,51 @@ struct btrfs_worker_thread { }; /* + * btrfs_start_workers uses kthread_run, which can block waiting for memory + * for a very long time. It will actually throttle on page writeback, + * and so it may not make progress until after our btrfs worker threads + * process all of the pending work structs in their queue + * + * This means we can't use btrfs_start_workers from inside a btrfs worker + * thread that is used as part of cleaning dirty memory, which pretty much + * involves all of the worker threads. + * + * Instead we have a helper queue who never has more than one thread + * where we scheduler thread start operations. This worker_start struct + * is used to contain the work and hold a pointer to the queue that needs + * another worker. + */ +struct worker_start { + struct btrfs_work work; + struct btrfs_workers *queue; +}; + +static void start_new_worker_func(struct btrfs_work *work) +{ + struct worker_start *start; + start = container_of(work, struct worker_start, work); + btrfs_start_workers(start->queue, 1); + kfree(start); +} + +static int start_new_worker(struct btrfs_workers *queue) +{ + struct worker_start *start; + int ret; + + start = kzalloc(sizeof(*start), GFP_NOFS); + if (!start) + return -ENOMEM; + + start->work.func = start_new_worker_func; + start->queue = queue; + ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work); + if (ret) + kfree(start); + return ret; +} + +/* * helper function to move a thread onto the idle list after it * has finished some requests. */ @@ -71,7 +119,12 @@ static void check_idle_worker(struct btrfs_worker_thread *worker) unsigned long flags; spin_lock_irqsave(&worker->workers->lock, flags); worker->idle = 1; - list_move(&worker->worker_list, &worker->workers->idle_list); + + /* the list may be empty if the worker is just starting */ + if (!list_empty(&worker->worker_list)) { + list_move(&worker->worker_list, + &worker->workers->idle_list); + } spin_unlock_irqrestore(&worker->workers->lock, flags); } } @@ -87,23 +140,51 @@ static void check_busy_worker(struct btrfs_worker_thread *worker) unsigned long flags; spin_lock_irqsave(&worker->workers->lock, flags); worker->idle = 0; - list_move_tail(&worker->worker_list, - &worker->workers->worker_list); + + if (!list_empty(&worker->worker_list)) { + list_move_tail(&worker->worker_list, + &worker->workers->worker_list); + } spin_unlock_irqrestore(&worker->workers->lock, flags); } } -static noinline int run_ordered_completions(struct btrfs_workers *workers, - struct btrfs_work *work) +static void check_pending_worker_creates(struct btrfs_worker_thread *worker) { + struct btrfs_workers *workers = worker->workers; unsigned long flags; + rmb(); + if (!workers->atomic_start_pending) + return; + + spin_lock_irqsave(&workers->lock, flags); + if (!workers->atomic_start_pending) + goto out; + + workers->atomic_start_pending = 0; + if (workers->num_workers + workers->num_workers_starting >= + workers->max_workers) + goto out; + + workers->num_workers_starting += 1; + spin_unlock_irqrestore(&workers->lock, flags); + start_new_worker(workers); + return; + +out: + spin_unlock_irqrestore(&workers->lock, flags); +} + +static noinline int run_ordered_completions(struct btrfs_workers *workers, + struct btrfs_work *work) +{ if (!workers->ordered) return 0; set_bit(WORK_DONE_BIT, &work->flags); - spin_lock_irqsave(&workers->lock, flags); + spin_lock(&workers->order_lock); while (1) { if (!list_empty(&workers->prio_order_list)) { @@ -126,45 +207,118 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers, if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) break; - spin_unlock_irqrestore(&workers->lock, flags); + spin_unlock(&workers->order_lock); work->ordered_func(work); /* now take the lock again and call the freeing code */ - spin_lock_irqsave(&workers->lock, flags); + spin_lock(&workers->order_lock); list_del(&work->order_list); work->ordered_free(work); } - spin_unlock_irqrestore(&workers->lock, flags); + spin_unlock(&workers->order_lock); return 0; } +static void put_worker(struct btrfs_worker_thread *worker) +{ + if (atomic_dec_and_test(&worker->refs)) + kfree(worker); +} + +static int try_worker_shutdown(struct btrfs_worker_thread *worker) +{ + int freeit = 0; + + spin_lock_irq(&worker->lock); + spin_lock(&worker->workers->lock); + if (worker->workers->num_workers > 1 && + worker->idle && + !worker->working && + !list_empty(&worker->worker_list) && + list_empty(&worker->prio_pending) && + list_empty(&worker->pending) && + atomic_read(&worker->num_pending) == 0) { + freeit = 1; + list_del_init(&worker->worker_list); + worker->workers->num_workers--; + } + spin_unlock(&worker->workers->lock); + spin_unlock_irq(&worker->lock); + + if (freeit) + put_worker(worker); + return freeit; +} + +static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker, + struct list_head *prio_head, + struct list_head *head) +{ + struct btrfs_work *work = NULL; + struct list_head *cur = NULL; + + if(!list_empty(prio_head)) + cur = prio_head->next; + + smp_mb(); + if (!list_empty(&worker->prio_pending)) + goto refill; + + if (!list_empty(head)) + cur = head->next; + + if (cur) + goto out; + +refill: + spin_lock_irq(&worker->lock); + list_splice_tail_init(&worker->prio_pending, prio_head); + list_splice_tail_init(&worker->pending, head); + + if (!list_empty(prio_head)) + cur = prio_head->next; + else if (!list_empty(head)) + cur = head->next; + spin_unlock_irq(&worker->lock); + + if (!cur) + goto out_fail; + +out: + work = list_entry(cur, struct btrfs_work, list); + +out_fail: + return work; +} + /* * main loop for servicing work items */ static int worker_loop(void *arg) { struct btrfs_worker_thread *worker = arg; - struct list_head *cur; + struct list_head head; + struct list_head prio_head; struct btrfs_work *work; + + INIT_LIST_HEAD(&head); + INIT_LIST_HEAD(&prio_head); + do { - spin_lock_irq(&worker->lock); -again_locked: +again: while (1) { - if (!list_empty(&worker->prio_pending)) - cur = worker->prio_pending.next; - else if (!list_empty(&worker->pending)) - cur = worker->pending.next; - else + + + work = get_next_work(worker, &prio_head, &head); + if (!work) break; - work = list_entry(cur, struct btrfs_work, list); list_del(&work->list); clear_bit(WORK_QUEUED_BIT, &work->flags); work->worker = worker; - spin_unlock_irq(&worker->lock); work->func(work); @@ -175,9 +329,13 @@ again_locked: */ run_ordered_completions(worker->workers, work); - spin_lock_irq(&worker->lock); - check_idle_worker(worker); + check_pending_worker_creates(worker); + } + + spin_lock_irq(&worker->lock); + check_idle_worker(worker); + if (freezing(current)) { worker->working = 0; spin_unlock_irq(&worker->lock); @@ -216,8 +374,10 @@ again_locked: spin_lock_irq(&worker->lock); set_current_state(TASK_INTERRUPTIBLE); if (!list_empty(&worker->pending) || - !list_empty(&worker->prio_pending)) - goto again_locked; + !list_empty(&worker->prio_pending)) { + spin_unlock_irq(&worker->lock); + goto again; + } /* * this makes sure we get a wakeup when someone @@ -226,8 +386,13 @@ again_locked: worker->working = 0; spin_unlock_irq(&worker->lock); - if (!kthread_should_stop()) - schedule(); + if (!kthread_should_stop()) { + schedule_timeout(HZ * 120); + if (!worker->working && + try_worker_shutdown(worker)) { + return 0; + } + } } __set_current_state(TASK_RUNNING); } @@ -242,41 +407,61 @@ int btrfs_stop_workers(struct btrfs_workers *workers) { struct list_head *cur; struct btrfs_worker_thread *worker; + int can_stop; + spin_lock_irq(&workers->lock); list_splice_init(&workers->idle_list, &workers->worker_list); while (!list_empty(&workers->worker_list)) { cur = workers->worker_list.next; worker = list_entry(cur, struct btrfs_worker_thread, worker_list); - kthread_stop(worker->task); - list_del(&worker->worker_list); - kfree(worker); + + atomic_inc(&worker->refs); + workers->num_workers -= 1; + if (!list_empty(&worker->worker_list)) { + list_del_init(&worker->worker_list); + put_worker(worker); + can_stop = 1; + } else + can_stop = 0; + spin_unlock_irq(&workers->lock); + if (can_stop) + kthread_stop(worker->task); + spin_lock_irq(&workers->lock); + put_worker(worker); } + spin_unlock_irq(&workers->lock); return 0; } /* * simple init on struct btrfs_workers */ -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, + struct btrfs_workers *async_helper) { workers->num_workers = 0; + workers->num_workers_starting = 0; INIT_LIST_HEAD(&workers->worker_list); INIT_LIST_HEAD(&workers->idle_list); INIT_LIST_HEAD(&workers->order_list); INIT_LIST_HEAD(&workers->prio_order_list); spin_lock_init(&workers->lock); + spin_lock_init(&workers->order_lock); workers->max_workers = max; workers->idle_thresh = 32; workers->name = name; workers->ordered = 0; + workers->atomic_start_pending = 0; + workers->atomic_worker_start = async_helper; } /* * starts new worker threads. This does not enforce the max worker * count in case you need to temporarily go past it. */ -int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +static int __btrfs_start_workers(struct btrfs_workers *workers, + int num_workers) { struct btrfs_worker_thread *worker; int ret = 0; @@ -293,7 +478,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) INIT_LIST_HEAD(&worker->prio_pending); INIT_LIST_HEAD(&worker->worker_list); spin_lock_init(&worker->lock); + atomic_set(&worker->num_pending, 0); + atomic_set(&worker->refs, 1); worker->workers = workers; worker->task = kthread_run(worker_loop, worker, "btrfs-%s-%d", workers->name, @@ -303,11 +490,12 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) kfree(worker); goto fail; } - spin_lock_irq(&workers->lock); list_add_tail(&worker->worker_list, &workers->idle_list); worker->idle = 1; workers->num_workers++; + workers->num_workers_starting--; + WARN_ON(workers->num_workers_starting < 0); spin_unlock_irq(&workers->lock); } return 0; @@ -316,6 +504,14 @@ fail: return ret; } +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +{ + spin_lock_irq(&workers->lock); + workers->num_workers_starting += num_workers; + spin_unlock_irq(&workers->lock); + return __btrfs_start_workers(workers, num_workers); +} + /* * run through the list and find a worker thread that doesn't have a lot * to do right now. This can return null if we aren't yet at the thread @@ -325,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) { struct btrfs_worker_thread *worker; struct list_head *next; - int enforce_min = workers->num_workers < workers->max_workers; + int enforce_min; + + enforce_min = (workers->num_workers + workers->num_workers_starting) < + workers->max_workers; /* * if we find an idle thread, don't move it to the end of the @@ -350,7 +549,6 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) */ next = workers->worker_list.next; worker = list_entry(next, struct btrfs_worker_thread, worker_list); - atomic_inc(&worker->num_pending); worker->sequence++; if (worker->sequence % workers->idle_thresh == 0) @@ -367,35 +565,49 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) { struct btrfs_worker_thread *worker; unsigned long flags; + struct list_head *fallback; again: spin_lock_irqsave(&workers->lock, flags); worker = next_worker(workers); - spin_unlock_irqrestore(&workers->lock, flags); if (!worker) { - spin_lock_irqsave(&workers->lock, flags); - if (workers->num_workers >= workers->max_workers) { - struct list_head *fallback = NULL; - /* - * we have failed to find any workers, just - * return the force one - */ - if (!list_empty(&workers->worker_list)) - fallback = workers->worker_list.next; - if (!list_empty(&workers->idle_list)) - fallback = workers->idle_list.next; - BUG_ON(!fallback); - worker = list_entry(fallback, - struct btrfs_worker_thread, worker_list); - spin_unlock_irqrestore(&workers->lock, flags); + if (workers->num_workers + workers->num_workers_starting >= + workers->max_workers) { + goto fallback; + } else if (workers->atomic_worker_start) { + workers->atomic_start_pending = 1; + goto fallback; } else { + workers->num_workers_starting++; spin_unlock_irqrestore(&workers->lock, flags); /* we're below the limit, start another worker */ - btrfs_start_workers(workers, 1); + __btrfs_start_workers(workers, 1); goto again; } } + goto found; + +fallback: + fallback = NULL; + /* + * we have failed to find any workers, just + * return the first one we can find. + */ + if (!list_empty(&workers->worker_list)) + fallback = workers->worker_list.next; + if (!list_empty(&workers->idle_list)) + fallback = workers->idle_list.next; + BUG_ON(!fallback); + worker = list_entry(fallback, + struct btrfs_worker_thread, worker_list); +found: + /* + * this makes sure the worker doesn't exit before it is placed + * onto a busy/idle list + */ + atomic_inc(&worker->num_pending); + spin_unlock_irqrestore(&workers->lock, flags); return worker; } @@ -427,7 +639,7 @@ int btrfs_requeue_work(struct btrfs_work *work) spin_lock(&worker->workers->lock); worker->idle = 0; list_move_tail(&worker->worker_list, - &worker->workers->worker_list); + &worker->workers->worker_list); spin_unlock(&worker->workers->lock); } if (!worker->working) { @@ -435,9 +647,9 @@ int btrfs_requeue_work(struct btrfs_work *work) worker->working = 1; } - spin_unlock_irqrestore(&worker->lock, flags); if (wake) wake_up_process(worker->task); + spin_unlock_irqrestore(&worker->lock, flags); out: return 0; @@ -463,14 +675,18 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) worker = find_worker(workers); if (workers->ordered) { - spin_lock_irqsave(&workers->lock, flags); + /* + * you're not allowed to do ordered queues from an + * interrupt handler + */ + spin_lock(&workers->order_lock); if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { list_add_tail(&work->order_list, &workers->prio_order_list); } else { list_add_tail(&work->order_list, &workers->order_list); } - spin_unlock_irqrestore(&workers->lock, flags); + spin_unlock(&workers->order_lock); } else { INIT_LIST_HEAD(&work->order_list); } @@ -481,7 +697,6 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) list_add_tail(&work->list, &worker->prio_pending); else list_add_tail(&work->list, &worker->pending); - atomic_inc(&worker->num_pending); check_busy_worker(worker); /* @@ -492,10 +707,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) wake = 1; worker->working = 1; - spin_unlock_irqrestore(&worker->lock, flags); - if (wake) wake_up_process(worker->task); + spin_unlock_irqrestore(&worker->lock, flags); + out: return 0; } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 1b511c1..5077746 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -64,6 +64,8 @@ struct btrfs_workers { /* current number of running workers */ int num_workers; + int num_workers_starting; + /* max number of workers allowed. changed by btrfs_start_workers */ int max_workers; @@ -73,6 +75,16 @@ struct btrfs_workers { /* force completions in the order they were queued */ int ordered; + /* more workers required, but in an interrupt handler */ + int atomic_start_pending; + + /* + * are we allowed to sleep while starting workers or are we required + * to start them at a later time? If we can't sleep, this indicates + * which queue we need to use to schedule thread creation. + */ + struct btrfs_workers *atomic_worker_start; + /* list with all the work threads. The workers on the idle thread * may be actively servicing jobs, but they haven't yet hit the * idle thresh limit above. @@ -90,6 +102,9 @@ struct btrfs_workers { /* lock for finding the next worker thread to queue on */ spinlock_t lock; + /* lock for the ordered lists */ + spinlock_t order_lock; + /* extra name for this worker, used for current->name */ char *name; }; @@ -97,7 +112,8 @@ struct btrfs_workers { int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); int btrfs_stop_workers(struct btrfs_workers *workers); -void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max, + struct btrfs_workers *async_starter); int btrfs_requeue_work(struct btrfs_work *work); void btrfs_set_work_high_prio(struct btrfs_work *work); #endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ea1ea0a..f6783a4 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -86,6 +86,12 @@ struct btrfs_inode { * transid of the trans_handle that last modified this inode */ u64 last_trans; + + /* + * log transid when this inode was last modified + */ + u64 last_sub_trans; + /* * transid that last logged this inode */ @@ -128,6 +134,16 @@ struct btrfs_inode { u64 last_unlink_trans; /* + * Counters to keep track of the number of extent item's we may use due + * to delalloc and such. outstanding_extents is the number of extent + * items we think we'll end up using, and reserved_extents is the number + * of extent items we've reserved metadata for. + */ + spinlock_t accounting_lock; + int reserved_extents; + int outstanding_extents; + + /* * ordered_data_close is set by truncate when a file that used * to have good data has been truncated to zero. When it is set * the btrfs file release call will add this inode to the @@ -138,6 +154,7 @@ struct btrfs_inode { * of these. */ unsigned ordered_data_close:1; + unsigned dummy_inode:1; struct inode vfs_inode; }; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 9d8ba4d..a11a320 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -506,10 +506,10 @@ static noinline int add_ra_bio_pages(struct inode *inode, */ set_page_extent_mapped(page); lock_extent(tree, last_offset, end, GFP_NOFS); - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, last_offset, PAGE_CACHE_SIZE); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); if (!em || last_offset < em->start || (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || @@ -593,11 +593,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, em_tree = &BTRFS_I(inode)->extent_tree; /* we need the actual starting offset of this extent in the file */ - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, page_offset(bio->bi_io_vec->bv_page), PAGE_CACHE_SIZE); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3fdcc05..ec96f3a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2853,6 +2853,12 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int split; int num_doubles = 0; + l = path->nodes[0]; + slot = path->slots[0]; + if (extend && data_size + btrfs_item_size_nr(l, slot) + + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) + return -EOVERFLOW; + /* first try to make some room by pushing left and right */ if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { wret = push_leaf_right(trans, root, path, data_size, 0); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 837435c..444b3e9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -114,6 +114,10 @@ struct btrfs_ordered_sum; */ #define BTRFS_DEV_ITEMS_OBJECTID 1ULL +#define BTRFS_BTREE_INODE_OBJECTID 1 + +#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 + /* * we can actually store much bigger names, but lets not confuse the rest * of linux @@ -670,21 +674,29 @@ struct btrfs_space_info { u64 bytes_reserved; /* total bytes the allocator has reserved for current allocations */ u64 bytes_readonly; /* total bytes that are read only */ - - /* delalloc accounting */ - u64 bytes_delalloc; /* number of bytes reserved for allocation, - this space is not necessarily reserved yet - by the allocator */ + u64 bytes_super; /* total bytes reserved for the super blocks */ + u64 bytes_root; /* the number of bytes needed to commit a + transaction */ u64 bytes_may_use; /* number of bytes that may be used for - delalloc */ + delalloc/allocations */ + u64 bytes_delalloc; /* number of bytes currently reserved for + delayed allocation */ int full; /* indicates that we cannot allocate any more chunks for this space */ int force_alloc; /* set if we need to force a chunk alloc for this space */ + int force_delalloc; /* make people start doing filemap_flush until + we're under a threshold */ struct list_head list; + /* for controlling how we free up space for allocations */ + wait_queue_head_t allocate_wait; + wait_queue_head_t flush_wait; + int allocating_chunk; + int flushing; + /* for block groups in our same type */ struct list_head block_groups; spinlock_t lock; @@ -726,6 +738,15 @@ enum btrfs_caching_type { BTRFS_CACHE_FINISHED = 2, }; +struct btrfs_caching_control { + struct list_head list; + struct mutex mutex; + wait_queue_head_t wait; + struct btrfs_block_group_cache *block_group; + u64 progress; + atomic_t count; +}; + struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; @@ -733,6 +754,7 @@ struct btrfs_block_group_cache { spinlock_t lock; u64 pinned; u64 reserved; + u64 bytes_super; u64 flags; u64 sectorsize; int extents_thresh; @@ -742,8 +764,9 @@ struct btrfs_block_group_cache { int dirty; /* cache tracking stuff */ - wait_queue_head_t caching_q; int cached; + struct btrfs_caching_control *caching_ctl; + u64 last_byte_to_unpin; struct btrfs_space_info *space_info; @@ -782,13 +805,16 @@ struct btrfs_fs_info { /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; + + spinlock_t fs_roots_radix_lock; struct radix_tree_root fs_roots_radix; /* block group cache stuff */ spinlock_t block_group_cache_lock; struct rb_root block_group_cache_tree; - struct extent_io_tree pinned_extents; + struct extent_io_tree freed_extents[2]; + struct extent_io_tree *pinned_extents; /* logical->physical extent mapping */ struct btrfs_mapping_tree mapping_tree; @@ -822,11 +848,7 @@ struct btrfs_fs_info { struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; struct mutex chunk_mutex; - struct mutex drop_mutex; struct mutex volume_mutex; - struct mutex tree_reloc_mutex; - struct rw_semaphore extent_commit_sem; - /* * this protects the ordered operations list only while we are * processing all of the entries on it. This way we make @@ -835,10 +857,16 @@ struct btrfs_fs_info { * before jumping into the main commit. */ struct mutex ordered_operations_mutex; + struct rw_semaphore extent_commit_sem; + + struct rw_semaphore subvol_sem; + + struct srcu_struct subvol_srcu; struct list_head trans_list; struct list_head hashers; struct list_head dead_roots; + struct list_head caching_block_groups; atomic_t nr_async_submits; atomic_t async_submit_draining; @@ -882,6 +910,7 @@ struct btrfs_fs_info { * A third pool does submit_bio to avoid deadlocking with the other * two */ + struct btrfs_workers generic_worker; struct btrfs_workers workers; struct btrfs_workers delalloc_workers; struct btrfs_workers endio_workers; @@ -889,6 +918,7 @@ struct btrfs_fs_info { struct btrfs_workers endio_meta_write_workers; struct btrfs_workers endio_write_workers; struct btrfs_workers submit_workers; + struct btrfs_workers enospc_workers; /* * fixup workers take dirty pages that didn't properly go through * the cow mechanism and make them safe to write. It happens @@ -979,7 +1009,10 @@ struct btrfs_root { atomic_t log_writers; atomic_t log_commit[2]; unsigned long log_transid; + unsigned long last_log_commit; unsigned long log_batch; + pid_t log_start_pid; + bool log_multiple_pids; u64 objectid; u64 last_trans; @@ -996,10 +1029,12 @@ struct btrfs_root { u32 stripesize; u32 type; - u64 highest_inode; - u64 last_inode_alloc; + + u64 highest_objectid; int ref_cows; int track_dirty; + int in_radix; + u64 defrag_trans_start; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; @@ -1118,6 +1153,7 @@ struct btrfs_root { #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) #define BTRFS_MOUNT_NOSSD (1 << 9) +#define BTRFS_MOUNT_DISCARD (1 << 10) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1920,8 +1956,8 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); -int btrfs_update_pinned_extents(struct btrfs_root *root, - u64 bytenr, u64 num, int pin); +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num, int reserved); int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, @@ -1971,9 +2007,10 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, u64 root_objectid, u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_io_tree *unpin); + struct btrfs_root *root); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -1984,6 +2021,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); int btrfs_free_block_groups(struct btrfs_fs_info *info); int btrfs_read_block_groups(struct btrfs_root *root); +int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr); int btrfs_make_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytes_used, u64 type, u64 chunk_objectid, u64 chunk_offset, @@ -1997,7 +2035,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); -int btrfs_check_metadata_free_space(struct btrfs_root *root); +int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); +int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); +int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items); +int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items); int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct btrfs_root *root, @@ -2006,7 +2049,6 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, u64 bytes); void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); -void btrfs_free_pinned_extents(struct btrfs_fs_info *info); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -2100,12 +2142,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *parent); /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, - struct btrfs_path *path, - u64 root_id, u64 ref_id); + struct btrfs_path *path, + u64 root_id, u64 ref_id); int btrfs_add_root_ref(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root, - u64 root_id, u8 type, u64 ref_id, - u64 dirid, u64 sequence, + u64 root_id, u64 ref_id, u64 dirid, u64 sequence, + const char *name, int name_len); +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, const char *name, int name_len); int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key); @@ -2120,6 +2165,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct int btrfs_search_root(struct btrfs_root *root, u64 search_start, u64 *found_objectid); int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); +int btrfs_find_orphan_roots(struct btrfs_root *tree_root); int btrfs_set_root_node(struct btrfs_root_item *item, struct extent_buffer *node); /* dir-item.c */ @@ -2138,6 +2184,10 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 dir, u64 objectid, const char *name, int name_len, int mod); +struct btrfs_dir_item * +btrfs_search_dir_index_item(struct btrfs_root *root, + struct btrfs_path *path, u64 dirid, + const char *name, int name_len); struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, struct btrfs_path *path, const char *name, int name_len); @@ -2160,6 +2210,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); +int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); /* inode-map.c */ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, @@ -2232,6 +2283,10 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, int btrfs_add_link(struct btrfs_trans_handle *trans, struct inode *parent_inode, struct inode *inode, const char *name, int name_len, int add_backref, u64 index); +int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, u64 objectid, + const char *name, int name_len); int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 new_size, @@ -2242,7 +2297,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, struct dentry *dentry, + struct btrfs_root *new_root, u64 new_dirid, u64 alloc_hint); int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); @@ -2258,6 +2313,7 @@ int btrfs_write_inode(struct inode *inode, int wait); void btrfs_dirty_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); void btrfs_destroy_inode(struct inode *inode); +void btrfs_drop_inode(struct inode *inode); int btrfs_init_cachep(void); void btrfs_destroy_cachep(void); long btrfs_ioctl_trans_end(struct file *file); @@ -2275,6 +2331,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t size); +int btrfs_invalidate_inodes(struct btrfs_root *root); +extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -2286,11 +2344,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned); int btrfs_check_file(struct btrfs_root *root, struct inode *inode); -extern struct file_operations btrfs_file_operations; +extern const struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, u64 locked_end, - u64 inline_limit, u64 *hint_block); + u64 inline_limit, u64 *hint_block, int drop_cache); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end); @@ -2317,7 +2375,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); /* acl.c */ -#ifdef CONFIG_FS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL int btrfs_check_acl(struct inode *inode, int mask); #else #define btrfs_check_acl NULL diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 1d70236..f3a6075 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -281,6 +281,53 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, return btrfs_match_dir_item_name(root, path, name, name_len); } +struct btrfs_dir_item * +btrfs_search_dir_index_item(struct btrfs_root *root, + struct btrfs_path *path, u64 dirid, + const char *name, int name_len) +{ + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u32 nritems; + int ret; + + key.objectid = dirid; + key.type = BTRFS_DIR_INDEX_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ERR_PTR(ret); + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + + while (1) { + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) + break; + + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (di) + return di; + + path->slots[0]++; + } + return NULL; +} + struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6c41731..02b6afb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -41,6 +41,7 @@ static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); +static void free_fs_root(struct btrfs_root *root); static atomic_t btrfs_bdi_num = ATOMIC_INIT(0); @@ -123,15 +124,15 @@ static struct extent_map *btree_get_extent(struct inode *inode, struct extent_map *em; int ret; - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (em) { em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); goto out; } - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); em = alloc_extent_map(GFP_NOFS); if (!em) { @@ -144,7 +145,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, em->block_start = 0; em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); if (ret == -EEXIST) { u64 failed_start = em->start; @@ -163,7 +164,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, free_extent_map(em); em = NULL; } - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); if (ret) em = ERR_PTR(ret); @@ -821,14 +822,14 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, int btrfs_write_tree_block(struct extent_buffer *buf) { - return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start, - buf->start + buf->len - 1, WB_SYNC_ALL); + return filemap_fdatawrite_range(buf->first_page->mapping, buf->start, + buf->start + buf->len - 1); } int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) { - return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, - buf->start, buf->start + buf->len - 1); + return filemap_fdatawait_range(buf->first_page->mapping, + buf->start, buf->start + buf->len - 1); } struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, @@ -895,8 +896,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->fs_info = fs_info; root->objectid = objectid; root->last_trans = 0; - root->highest_inode = 0; - root->last_inode_alloc = 0; + root->highest_objectid = 0; root->name = NULL; root->in_sysfs = 0; root->inode_tree.rb_node = NULL; @@ -917,6 +917,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(&root->log_writers, 0); root->log_batch = 0; root->log_transid = 0; + root->last_log_commit = 0; extent_io_tree_init(&root->dirty_log_pages, fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -952,14 +953,16 @@ static int find_and_setup_root(struct btrfs_root *tree_root, root, fs_info, objectid); ret = btrfs_find_last_root(tree_root, objectid, &root->root_item, &root->root_key); + if (ret > 0) + return -ENOENT; BUG_ON(ret); generation = btrfs_root_generation(&root->root_item); blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); - root->commit_root = btrfs_root_node(root); BUG_ON(!root->node); + root->commit_root = btrfs_root_node(root); return 0; } @@ -1085,6 +1088,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root->log_root); root->log_root = log_root; root->log_transid = 0; + root->last_log_commit = 0; return 0; } @@ -1095,7 +1099,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_path *path; struct extent_buffer *l; - u64 highest_inode; u64 generation; u32 blocksize; int ret = 0; @@ -1110,7 +1113,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, kfree(root); return ERR_PTR(ret); } - goto insert; + goto out; } __setup_root(tree_root->nodesize, tree_root->leafsize, @@ -1120,39 +1123,30 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, path = btrfs_alloc_path(); BUG_ON(!path); ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); - if (ret != 0) { - if (ret > 0) - ret = -ENOENT; - goto out; + if (ret == 0) { + l = path->nodes[0]; + read_extent_buffer(l, &root->root_item, + btrfs_item_ptr_offset(l, path->slots[0]), + sizeof(root->root_item)); + memcpy(&root->root_key, location, sizeof(*location)); } - l = path->nodes[0]; - read_extent_buffer(l, &root->root_item, - btrfs_item_ptr_offset(l, path->slots[0]), - sizeof(root->root_item)); - memcpy(&root->root_key, location, sizeof(*location)); - ret = 0; -out: - btrfs_release_path(root, path); btrfs_free_path(path); if (ret) { - kfree(root); + if (ret > 0) + ret = -ENOENT; return ERR_PTR(ret); } + generation = btrfs_root_generation(&root->root_item); blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); root->commit_root = btrfs_root_node(root); BUG_ON(!root->node); -insert: - if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { +out: + if (location->objectid != BTRFS_TREE_LOG_OBJECTID) root->ref_cows = 1; - ret = btrfs_find_highest_inode(root, &highest_inode); - if (ret == 0) { - root->highest_inode = highest_inode; - root->last_inode_alloc = highest_inode; - } - } + return root; } @@ -1187,39 +1181,66 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, return fs_info->dev_root; if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) return fs_info->csum_root; - +again: + spin_lock(&fs_info->fs_roots_radix_lock); root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)location->objectid); + spin_unlock(&fs_info->fs_roots_radix_lock); if (root) return root; + ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); + if (ret == 0) + ret = -ENOENT; + if (ret < 0) + return ERR_PTR(ret); + root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); if (IS_ERR(root)) return root; + WARN_ON(btrfs_root_refs(&root->root_item) == 0); set_anon_super(&root->anon_super, NULL); + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + goto fail; + + spin_lock(&fs_info->fs_roots_radix_lock); ret = radix_tree_insert(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, root); + if (ret == 0) + root->in_radix = 1; + spin_unlock(&fs_info->fs_roots_radix_lock); + radix_tree_preload_end(); if (ret) { - free_extent_buffer(root->node); - kfree(root); - return ERR_PTR(ret); + if (ret == -EEXIST) { + free_fs_root(root); + goto again; + } + goto fail; } - if (!(fs_info->sb->s_flags & MS_RDONLY)) { - ret = btrfs_find_dead_roots(fs_info->tree_root, - root->root_key.objectid); - BUG_ON(ret); + + ret = btrfs_find_dead_roots(fs_info->tree_root, + root->root_key.objectid); + WARN_ON(ret); + + if (!(fs_info->sb->s_flags & MS_RDONLY)) btrfs_orphan_cleanup(root); - } + return root; +fail: + free_fs_root(root); + return ERR_PTR(ret); } struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_key *location, const char *name, int namelen) { + return btrfs_read_fs_root_no_name(fs_info, location); +#if 0 struct btrfs_root *root; int ret; @@ -1236,7 +1257,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, kfree(root); return ERR_PTR(ret); } -#if 0 + ret = btrfs_sysfs_add_root(root); if (ret) { free_extent_buffer(root->node); @@ -1244,9 +1265,9 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, kfree(root); return ERR_PTR(ret); } -#endif root->in_sysfs = 1; return root; +#endif } static int btrfs_congested_fn(void *congested_data, int bdi_bits) @@ -1325,9 +1346,9 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) offset = page_offset(page); em_tree = &BTRFS_I(inode)->extent_tree; - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); if (!em) { __unplug_io_fn(bdi, page); return; @@ -1360,8 +1381,10 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) err = bdi_register(bdi, NULL, "btrfs-%d", atomic_inc_return(&btrfs_bdi_num)); - if (err) + if (err) { + bdi_destroy(bdi); return err; + } bdi->ra_pages = default_backing_dev_info.ra_pages; bdi->unplug_io_fn = btrfs_unplug_io_fn; @@ -1451,9 +1474,12 @@ static int cleaner_kthread(void *arg) break; vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); - mutex_lock(&root->fs_info->cleaner_mutex); - btrfs_clean_old_snapshots(root); - mutex_unlock(&root->fs_info->cleaner_mutex); + + if (!(root->fs_info->sb->s_flags & MS_RDONLY) && + mutex_trylock(&root->fs_info->cleaner_mutex)) { + btrfs_clean_old_snapshots(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + } if (freezing(current)) { refrigerator(); @@ -1558,15 +1584,36 @@ struct btrfs_root *open_ctree(struct super_block *sb, err = -ENOMEM; goto fail; } - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS); + + ret = init_srcu_struct(&fs_info->subvol_srcu); + if (ret) { + err = ret; + goto fail; + } + + ret = setup_bdi(fs_info, &fs_info->bdi); + if (ret) { + err = ret; + goto fail_srcu; + } + + fs_info->btree_inode = new_inode(sb); + if (!fs_info->btree_inode) { + err = -ENOMEM; + goto fail_bdi; + } + + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->hashers); INIT_LIST_HEAD(&fs_info->delalloc_inodes); INIT_LIST_HEAD(&fs_info->ordered_operations); + INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_lock); spin_lock_init(&fs_info->new_trans_lock); spin_lock_init(&fs_info->ref_cache_lock); + spin_lock_init(&fs_info->fs_roots_radix_lock); init_completion(&fs_info->kobj_unregister); fs_info->tree_root = tree_root; @@ -1585,12 +1632,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->sb = sb; fs_info->max_extent = (u64)-1; fs_info->max_inline = 8192 * 1024; - if (setup_bdi(fs_info, &fs_info->bdi)) - goto fail_bdi; - fs_info->btree_inode = new_inode(sb); - fs_info->btree_inode->i_ino = 1; - fs_info->btree_inode->i_nlink = 1; - fs_info->metadata_ratio = 8; + fs_info->metadata_ratio = 0; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); @@ -1602,6 +1644,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, sb->s_blocksize_bits = blksize_bits(4096); sb->s_bdi = &fs_info->bdi; + fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; + fs_info->btree_inode->i_nlink = 1; /* * we set the i_size on the btree inode to the max possible int. * the real end of the address space is determined by all of @@ -1620,28 +1664,32 @@ struct btrfs_root *open_ctree(struct super_block *sb, BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; + BTRFS_I(fs_info->btree_inode)->root = tree_root; + memset(&BTRFS_I(fs_info->btree_inode)->location, 0, + sizeof(struct btrfs_key)); + BTRFS_I(fs_info->btree_inode)->dummy_inode = 1; + insert_inode_hash(fs_info->btree_inode); + spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree.rb_node = NULL; - extent_io_tree_init(&fs_info->pinned_extents, + extent_io_tree_init(&fs_info->freed_extents[0], fs_info->btree_inode->i_mapping, GFP_NOFS); + extent_io_tree_init(&fs_info->freed_extents[1], + fs_info->btree_inode->i_mapping, GFP_NOFS); + fs_info->pinned_extents = &fs_info->freed_extents[0]; fs_info->do_barriers = 1; - BTRFS_I(fs_info->btree_inode)->root = tree_root; - memset(&BTRFS_I(fs_info->btree_inode)->location, 0, - sizeof(struct btrfs_key)); - insert_inode_hash(fs_info->btree_inode); mutex_init(&fs_info->trans_mutex); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); - mutex_init(&fs_info->drop_mutex); mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); - mutex_init(&fs_info->tree_reloc_mutex); init_rwsem(&fs_info->extent_commit_sem); + init_rwsem(&fs_info->subvol_sem); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); btrfs_init_free_cluster(&fs_info->data_alloc_cluster); @@ -1701,20 +1749,24 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_iput; } - /* - * we need to start all the end_io workers up front because the - * queue work function gets called at interrupt time, and so it - * cannot dynamically grow. - */ + btrfs_init_workers(&fs_info->generic_worker, + "genwork", 1, NULL); + btrfs_init_workers(&fs_info->workers, "worker", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->submit_workers, "submit", min_t(u64, fs_devices->num_devices, - fs_info->thread_pool_size)); + fs_info->thread_pool_size), + &fs_info->generic_worker); + btrfs_init_workers(&fs_info->enospc_workers, "enospc", + fs_info->thread_pool_size, + &fs_info->generic_worker); /* a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the @@ -1728,15 +1780,20 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->delalloc_workers.idle_thresh = 2; fs_info->delalloc_workers.ordered = 1; - btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); + btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_workers, "endio", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_meta_write_workers, - "endio-meta-write", fs_info->thread_pool_size); + "endio-meta-write", fs_info->thread_pool_size, + &fs_info->generic_worker); btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", - fs_info->thread_pool_size); + fs_info->thread_pool_size, + &fs_info->generic_worker); /* * endios are largely parallel and should have a very @@ -1745,20 +1802,19 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->endio_workers.idle_thresh = 4; fs_info->endio_meta_workers.idle_thresh = 4; - fs_info->endio_write_workers.idle_thresh = 64; - fs_info->endio_meta_write_workers.idle_thresh = 64; + fs_info->endio_write_workers.idle_thresh = 2; + fs_info->endio_meta_write_workers.idle_thresh = 2; btrfs_start_workers(&fs_info->workers, 1); + btrfs_start_workers(&fs_info->generic_worker, 1); btrfs_start_workers(&fs_info->submit_workers, 1); btrfs_start_workers(&fs_info->delalloc_workers, 1); btrfs_start_workers(&fs_info->fixup_workers, 1); - btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); - btrfs_start_workers(&fs_info->endio_meta_workers, - fs_info->thread_pool_size); - btrfs_start_workers(&fs_info->endio_meta_write_workers, - fs_info->thread_pool_size); - btrfs_start_workers(&fs_info->endio_write_workers, - fs_info->thread_pool_size); + btrfs_start_workers(&fs_info->endio_workers, 1); + btrfs_start_workers(&fs_info->endio_meta_workers, 1); + btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); + btrfs_start_workers(&fs_info->endio_write_workers, 1); + btrfs_start_workers(&fs_info->enospc_workers, 1); fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -1918,6 +1974,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, } } + ret = btrfs_find_orphan_roots(tree_root); + BUG_ON(ret); + if (!(sb->s_flags & MS_RDONLY)) { ret = btrfs_recover_relocation(tree_root); BUG_ON(ret); @@ -1961,6 +2020,7 @@ fail_chunk_root: free_extent_buffer(chunk_root->node); free_extent_buffer(chunk_root->commit_root); fail_sb_buffer: + btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); btrfs_stop_workers(&fs_info->workers); @@ -1969,6 +2029,7 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->enospc_workers); fail_iput: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); @@ -1977,6 +2038,8 @@ fail_iput: btrfs_mapping_tree_free(&fs_info->mapping_tree); fail_bdi: bdi_destroy(&fs_info->bdi); +fail_srcu: + cleanup_srcu_struct(&fs_info->subvol_srcu); fail: kfree(extent_root); kfree(tree_root); @@ -2236,20 +2299,29 @@ int write_ctree_super(struct btrfs_trans_handle *trans, int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { - WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); + spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_delete(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid); + spin_unlock(&fs_info->fs_roots_radix_lock); + + if (btrfs_root_refs(&root->root_item) == 0) + synchronize_srcu(&fs_info->subvol_srcu); + + free_fs_root(root); + return 0; +} + +static void free_fs_root(struct btrfs_root *root) +{ + WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); if (root->anon_super.s_dev) { down_write(&root->anon_super.s_umount); kill_anon_super(&root->anon_super); } - if (root->node) - free_extent_buffer(root->node); - if (root->commit_root) - free_extent_buffer(root->commit_root); + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); kfree(root->name); kfree(root); - return 0; } static int del_fs_roots(struct btrfs_fs_info *fs_info) @@ -2258,6 +2330,20 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info) struct btrfs_root *gang[8]; int i; + while (!list_empty(&fs_info->dead_roots)) { + gang[0] = list_entry(fs_info->dead_roots.next, + struct btrfs_root, root_list); + list_del(&gang[0]->root_list); + + if (gang[0]->in_radix) { + btrfs_free_fs_root(fs_info, gang[0]); + } else { + free_extent_buffer(gang[0]->node); + free_extent_buffer(gang[0]->commit_root); + kfree(gang[0]); + } + } + while (1) { ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, (void **)gang, 0, @@ -2287,9 +2373,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) root_objectid = gang[ret - 1]->root_key.objectid + 1; for (i = 0; i < ret; i++) { root_objectid = gang[i]->root_key.objectid; - ret = btrfs_find_dead_roots(fs_info->tree_root, - root_objectid); - BUG_ON(ret); btrfs_orphan_cleanup(gang[i]); } root_objectid++; @@ -2359,12 +2442,12 @@ int close_ctree(struct btrfs_root *root) free_extent_buffer(root->fs_info->csum_root->commit_root); btrfs_free_block_groups(root->fs_info); - btrfs_free_pinned_extents(root->fs_info); del_fs_roots(fs_info); iput(fs_info->btree_inode); + btrfs_stop_workers(&fs_info->generic_worker); btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); btrfs_stop_workers(&fs_info->workers); @@ -2373,11 +2456,13 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); + btrfs_stop_workers(&fs_info->enospc_workers); btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); bdi_destroy(&fs_info->bdi); + cleanup_srcu_struct(&fs_info->subvol_srcu); kfree(fs_info->extent_root); kfree(fs_info->tree_root); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 9596b40..ba5c3fd 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -28,7 +28,7 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, len = BTRFS_FID_SIZE_NON_CONNECTABLE; type = FILEID_BTRFS_WITHOUT_PARENT; - fid->objectid = BTRFS_I(inode)->location.objectid; + fid->objectid = inode->i_ino; fid->root_objectid = BTRFS_I(inode)->root->objectid; fid->gen = inode->i_generation; @@ -60,34 +60,61 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, } static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u32 generation) + u64 root_objectid, u32 generation, + int check_generation) { + struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; struct btrfs_root *root; + struct dentry *dentry; struct inode *inode; struct btrfs_key key; + int index; + int err = 0; + + if (objectid < BTRFS_FIRST_FREE_OBJECTID) + return ERR_PTR(-ESTALE); key.objectid = root_objectid; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); key.offset = (u64)-1; - root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key); - if (IS_ERR(root)) - return ERR_CAST(root); + index = srcu_read_lock(&fs_info->subvol_srcu); + + root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto fail; + } + + if (btrfs_root_refs(&root->root_item) == 0) { + err = -ENOENT; + goto fail; + } key.objectid = objectid; btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; inode = btrfs_iget(sb, &key, root); - if (IS_ERR(inode)) - return (void *)inode; + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto fail; + } + + srcu_read_unlock(&fs_info->subvol_srcu, index); - if (generation != inode->i_generation) { + if (check_generation && generation != inode->i_generation) { iput(inode); return ERR_PTR(-ESTALE); } - return d_obtain_alias(inode); + dentry = d_obtain_alias(inode); + if (!IS_ERR(dentry)) + dentry->d_op = &btrfs_dentry_operations; + return dentry; +fail: + srcu_read_unlock(&fs_info->subvol_srcu, index); + return ERR_PTR(err); } static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, @@ -111,7 +138,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, objectid = fid->parent_objectid; generation = fid->parent_gen; - return btrfs_get_dentry(sb, objectid, root_objectid, generation); + return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); } static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, @@ -133,66 +160,76 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, root_objectid = fid->root_objectid; generation = fid->gen; - return btrfs_get_dentry(sb, objectid, root_objectid, generation); + return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); } static struct dentry *btrfs_get_parent(struct dentry *child) { struct inode *dir = child->d_inode; + static struct dentry *dentry; struct btrfs_root *root = BTRFS_I(dir)->root; - struct btrfs_key key; struct btrfs_path *path; struct extent_buffer *leaf; - int slot; - u64 objectid; + struct btrfs_root_ref *ref; + struct btrfs_key key; + struct btrfs_key found_key; int ret; path = btrfs_alloc_path(); - key.objectid = dir->i_ino; - btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); - key.offset = (u64)-1; + if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = root->root_key.objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + root = root->fs_info->tree_root; + } else { + key.objectid = dir->i_ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - /* Error */ - btrfs_free_path(path); - return ERR_PTR(ret); + if (ret < 0) + goto fail; + + BUG_ON(ret == 0); + if (path->slots[0] == 0) { + ret = -ENOENT; + goto fail; } + + path->slots[0]--; leaf = path->nodes[0]; - slot = path->slots[0]; - if (ret) { - /* btrfs_search_slot() returns the slot where we'd want to - insert a backref for parent inode #0xFFFFFFFFFFFFFFFF. - The _real_ backref, telling us what the parent inode - _actually_ is, will be in the slot _before_ the one - that btrfs_search_slot() returns. */ - if (!slot) { - /* Unless there is _no_ key in the tree before... */ - btrfs_free_path(path); - return ERR_PTR(-EIO); - } - slot--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != key.objectid || found_key.type != key.type) { + ret = -ENOENT; + goto fail; } - btrfs_item_key_to_cpu(leaf, &key, slot); + if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + key.objectid = btrfs_root_ref_dirid(leaf, ref); + } else { + key.objectid = found_key.offset; + } btrfs_free_path(path); - if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) - return ERR_PTR(-EINVAL); - - objectid = key.offset; - - /* If we are already at the root of a subvol, return the real root */ - if (objectid == dir->i_ino) - return dget(dir->i_sb->s_root); + if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { + return btrfs_get_dentry(root->fs_info->sb, key.objectid, + found_key.offset, 0, 0); + } - /* Build a new key for the inode item */ - key.objectid = objectid; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - - return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); + dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); + if (!IS_ERR(dentry)) + dentry->d_op = &btrfs_dentry_operations; + return dentry; +fail: + btrfs_free_path(path); + return ERR_PTR(ret); } const struct export_operations btrfs_export_ops = { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 535f85b..e238a0c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -32,12 +32,12 @@ #include "locking.h" #include "free-space-cache.h" -static int update_reserved_extents(struct btrfs_root *root, - u64 bytenr, u64 num, int reserve); static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc, int mark_free); +static int update_reserved_extents(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -57,10 +57,19 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, int level, struct btrfs_key *ins); - static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force); +static int pin_down_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 num_bytes, + int is_data, int reserved, + struct extent_buffer **must_clean); +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key); +static void dump_space_info(struct btrfs_space_info *info, u64 bytes, + int dump_block_groups); static noinline int block_group_cache_done(struct btrfs_block_group_cache *cache) @@ -153,34 +162,34 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, return ret; } -/* - * We always set EXTENT_LOCKED for the super mirror extents so we don't - * overwrite them, so those bits need to be unset. Also, if we are unmounting - * with pinned extents still sitting there because we had a block group caching, - * we need to clear those now, since we are done. - */ -void btrfs_free_pinned_extents(struct btrfs_fs_info *info) +static int add_excluded_extent(struct btrfs_root *root, + u64 start, u64 num_bytes) { - u64 start, end, last = 0; - int ret; + u64 end = start + num_bytes - 1; + set_extent_bits(&root->fs_info->freed_extents[0], + start, end, EXTENT_UPTODATE, GFP_NOFS); + set_extent_bits(&root->fs_info->freed_extents[1], + start, end, EXTENT_UPTODATE, GFP_NOFS); + return 0; +} - while (1) { - ret = find_first_extent_bit(&info->pinned_extents, last, - &start, &end, - EXTENT_LOCKED|EXTENT_DIRTY); - if (ret) - break; +static void free_excluded_extents(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + u64 start, end; - clear_extent_bits(&info->pinned_extents, start, end, - EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS); - last = end+1; - } + start = cache->key.objectid; + end = start + cache->key.offset - 1; + + clear_extent_bits(&root->fs_info->freed_extents[0], + start, end, EXTENT_UPTODATE, GFP_NOFS); + clear_extent_bits(&root->fs_info->freed_extents[1], + start, end, EXTENT_UPTODATE, GFP_NOFS); } -static int remove_sb_from_cache(struct btrfs_root *root, - struct btrfs_block_group_cache *cache) +static int exclude_super_stripes(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) { - struct btrfs_fs_info *fs_info = root->fs_info; u64 bytenr; u64 *logical; int stripe_len; @@ -192,17 +201,42 @@ static int remove_sb_from_cache(struct btrfs_root *root, cache->key.objectid, bytenr, 0, &logical, &nr, &stripe_len); BUG_ON(ret); + while (nr--) { - try_lock_extent(&fs_info->pinned_extents, - logical[nr], - logical[nr] + stripe_len - 1, GFP_NOFS); + cache->bytes_super += stripe_len; + ret = add_excluded_extent(root, logical[nr], + stripe_len); + BUG_ON(ret); } + kfree(logical); } - return 0; } +static struct btrfs_caching_control * +get_caching_control(struct btrfs_block_group_cache *cache) +{ + struct btrfs_caching_control *ctl; + + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_STARTED) { + spin_unlock(&cache->lock); + return NULL; + } + + ctl = cache->caching_ctl; + atomic_inc(&ctl->count); + spin_unlock(&cache->lock); + return ctl; +} + +static void put_caching_control(struct btrfs_caching_control *ctl) +{ + if (atomic_dec_and_test(&ctl->count)) + kfree(ctl); +} + /* * this is only called by cache_block_group, since we could have freed extents * we need to check the pinned_extents for any extents that can't be used yet @@ -215,9 +249,9 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, int ret; while (start < end) { - ret = find_first_extent_bit(&info->pinned_extents, start, + ret = find_first_extent_bit(info->pinned_extents, start, &extent_start, &extent_end, - EXTENT_DIRTY|EXTENT_LOCKED); + EXTENT_DIRTY | EXTENT_UPTODATE); if (ret) break; @@ -249,22 +283,27 @@ static int caching_kthread(void *data) { struct btrfs_block_group_cache *block_group = data; struct btrfs_fs_info *fs_info = block_group->fs_info; - u64 last = 0; + struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; + struct btrfs_root *extent_root = fs_info->extent_root; struct btrfs_path *path; - int ret = 0; - struct btrfs_key key; struct extent_buffer *leaf; - int slot; + struct btrfs_key key; u64 total_found = 0; - - BUG_ON(!fs_info); + u64 last = 0; + u32 nritems; + int ret = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - atomic_inc(&block_group->space_info->caching_threads); + exclude_super_stripes(extent_root, block_group); + spin_lock(&block_group->space_info->lock); + block_group->space_info->bytes_super += block_group->bytes_super; + spin_unlock(&block_group->space_info->lock); + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); + /* * We don't want to deadlock with somebody trying to allocate a new * extent for the extent root while also trying to search the extent @@ -277,74 +316,64 @@ static int caching_kthread(void *data) key.objectid = last; key.offset = 0; - btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + key.type = BTRFS_EXTENT_ITEM_KEY; again: + mutex_lock(&caching_ctl->mutex); /* need to make sure the commit_root doesn't disappear */ down_read(&fs_info->extent_commit_sem); - ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto err; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + while (1) { smp_mb(); - if (block_group->fs_info->closing > 1) { + if (fs_info->closing > 1) { last = (u64)-1; break; } - leaf = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(fs_info->extent_root, path); - if (ret < 0) - goto err; - else if (ret) + if (path->slots[0] < nritems) { + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + } else { + ret = find_next_key(path, 0, &key); + if (ret) break; - if (need_resched() || - btrfs_transaction_in_commit(fs_info)) { - leaf = path->nodes[0]; - - /* this shouldn't happen, but if the - * leaf is empty just move on. - */ - if (btrfs_header_nritems(leaf) == 0) - break; - /* - * we need to copy the key out so that - * we are sure the next search advances - * us forward in the btree. - */ - btrfs_item_key_to_cpu(leaf, &key, 0); - btrfs_release_path(fs_info->extent_root, path); - up_read(&fs_info->extent_commit_sem); + caching_ctl->progress = last; + btrfs_release_path(extent_root, path); + up_read(&fs_info->extent_commit_sem); + mutex_unlock(&caching_ctl->mutex); + if (btrfs_transaction_in_commit(fs_info)) schedule_timeout(1); - goto again; - } + else + cond_resched(); + goto again; + } + if (key.objectid < block_group->key.objectid) { + path->slots[0]++; continue; } - btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid < block_group->key.objectid) - goto next; if (key.objectid >= block_group->key.objectid + block_group->key.offset) break; - if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { + if (key.type == BTRFS_EXTENT_ITEM_KEY) { total_found += add_new_free_space(block_group, fs_info, last, key.objectid); last = key.objectid + key.offset; - } - if (total_found > (1024 * 1024 * 2)) { - total_found = 0; - wake_up(&block_group->caching_q); + if (total_found > (1024 * 1024 * 2)) { + total_found = 0; + wake_up(&caching_ctl->wait); + } } -next: path->slots[0]++; } ret = 0; @@ -352,33 +381,65 @@ next: total_found += add_new_free_space(block_group, fs_info, last, block_group->key.objectid + block_group->key.offset); + caching_ctl->progress = (u64)-1; spin_lock(&block_group->lock); + block_group->caching_ctl = NULL; block_group->cached = BTRFS_CACHE_FINISHED; spin_unlock(&block_group->lock); err: btrfs_free_path(path); up_read(&fs_info->extent_commit_sem); - atomic_dec(&block_group->space_info->caching_threads); - wake_up(&block_group->caching_q); + free_excluded_extents(extent_root, block_group); + + mutex_unlock(&caching_ctl->mutex); + wake_up(&caching_ctl->wait); + + put_caching_control(caching_ctl); + atomic_dec(&block_group->space_info->caching_threads); return 0; } static int cache_block_group(struct btrfs_block_group_cache *cache) { + struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_caching_control *caching_ctl; struct task_struct *tsk; int ret = 0; + smp_mb(); + if (cache->cached != BTRFS_CACHE_NO) + return 0; + + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); + BUG_ON(!caching_ctl); + + INIT_LIST_HEAD(&caching_ctl->list); + mutex_init(&caching_ctl->mutex); + init_waitqueue_head(&caching_ctl->wait); + caching_ctl->block_group = cache; + caching_ctl->progress = cache->key.objectid; + /* one for caching kthread, one for caching block group list */ + atomic_set(&caching_ctl->count, 2); + spin_lock(&cache->lock); if (cache->cached != BTRFS_CACHE_NO) { spin_unlock(&cache->lock); - return ret; + kfree(caching_ctl); + return 0; } + cache->caching_ctl = caching_ctl; cache->cached = BTRFS_CACHE_STARTED; spin_unlock(&cache->lock); + down_write(&fs_info->extent_commit_sem); + list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); + up_write(&fs_info->extent_commit_sem); + + atomic_inc(&cache->space_info->caching_threads); + tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", cache->key.objectid); if (IS_ERR(tsk)) { @@ -1507,23 +1568,23 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, return ret; } -#ifdef BIO_RW_DISCARD static void btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len) { blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, DISCARD_FL_BARRIER); } -#endif static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, u64 num_bytes) { -#ifdef BIO_RW_DISCARD int ret; u64 map_length = num_bytes; struct btrfs_multi_bio *multi = NULL; + if (!btrfs_test_opt(root, DISCARD)) + return 0; + /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, bytenr, &map_length, &multi, 0); @@ -1543,9 +1604,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, } return ret; -#else - return 0; -#endif } int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, @@ -1657,7 +1715,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, parent, ref_root, flags, ref->objectid, ref->offset, &ins, node->ref_mod); - update_reserved_extents(root, ins.objectid, ins.offset, 0); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, node->num_bytes, parent, @@ -1783,7 +1840,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, extent_op->flags_to_set, &extent_op->key, ref->level, &ins); - update_reserved_extents(root, ins.objectid, ins.offset, 0); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, node->num_bytes, parent, ref_root, @@ -1818,16 +1874,32 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, BUG_ON(extent_op); head = btrfs_delayed_node_to_head(node); if (insert_reserved) { + int mark_free = 0; + struct extent_buffer *must_clean = NULL; + + ret = pin_down_bytes(trans, root, NULL, + node->bytenr, node->num_bytes, + head->is_data, 1, &must_clean); + if (ret > 0) + mark_free = 1; + + if (must_clean) { + clean_tree_block(NULL, root, must_clean); + btrfs_tree_unlock(must_clean); + free_extent_buffer(must_clean); + } if (head->is_data) { ret = btrfs_del_csums(trans, root, node->bytenr, node->num_bytes); BUG_ON(ret); } - btrfs_update_pinned_extents(root, node->bytenr, - node->num_bytes, 1); - update_reserved_extents(root, node->bytenr, - node->num_bytes, 0); + if (mark_free) { + ret = btrfs_free_reserved_extent(root, + node->bytenr, + node->num_bytes); + BUG_ON(ret); + } } mutex_unlock(&head->mutex); return 0; @@ -2692,60 +2764,448 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) alloc_target); } +static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) +{ + u64 num_bytes; + int level; + + level = BTRFS_MAX_LEVEL - 2; + /* + * NOTE: these calculations are absolutely the worst possible case. + * This assumes that _every_ item we insert will require a new leaf, and + * that the tree has grown to its maximum level size. + */ + + /* + * for every item we insert we could insert both an extent item and a + * extent ref item. Then for ever item we insert, we will need to cow + * both the original leaf, plus the leaf to the left and right of it. + * + * Unless we are talking about the extent root, then we just want the + * number of items * 2, since we just need the extent item plus its ref. + */ + if (root == root->fs_info->extent_root) + num_bytes = num_items * 2; + else + num_bytes = (num_items + (2 * num_items)) * 3; + + /* + * num_bytes is total number of leaves we could need times the leaf + * size, and then for every leaf we could end up cow'ing 2 nodes per + * level, down to the leaf level. + */ + num_bytes = (num_bytes * root->leafsize) + + (num_bytes * (level * 2)) * root->nodesize; + + return num_bytes; +} + /* - * for now this just makes sure we have at least 5% of our metadata space free - * for use. + * Unreserve metadata space for delalloc. If we have less reserved credits than + * we have extents, this function does nothing. */ -int btrfs_check_metadata_free_space(struct btrfs_root *root) +int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items) { struct btrfs_fs_info *info = root->fs_info; struct btrfs_space_info *meta_sinfo; - u64 alloc_target, thresh; - int committed = 0, ret; + u64 num_bytes; + u64 alloc_target; + bool bug = false; /* get the space info for where the metadata will live */ alloc_target = btrfs_get_alloc_profile(root, 0); meta_sinfo = __find_space_info(info, alloc_target); -again: + num_bytes = calculate_bytes_needed(root->fs_info->extent_root, + num_items); + spin_lock(&meta_sinfo->lock); - if (!meta_sinfo->full) - thresh = meta_sinfo->total_bytes * 80; - else - thresh = meta_sinfo->total_bytes * 95; + spin_lock(&BTRFS_I(inode)->accounting_lock); + if (BTRFS_I(inode)->reserved_extents <= + BTRFS_I(inode)->outstanding_extents) { + spin_unlock(&BTRFS_I(inode)->accounting_lock); + spin_unlock(&meta_sinfo->lock); + return 0; + } + spin_unlock(&BTRFS_I(inode)->accounting_lock); + + BTRFS_I(inode)->reserved_extents--; + BUG_ON(BTRFS_I(inode)->reserved_extents < 0); + + if (meta_sinfo->bytes_delalloc < num_bytes) { + bug = true; + meta_sinfo->bytes_delalloc = 0; + } else { + meta_sinfo->bytes_delalloc -= num_bytes; + } + spin_unlock(&meta_sinfo->lock); + + BUG_ON(bug); + + return 0; +} + +static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) +{ + u64 thresh; + + thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super + meta_sinfo->bytes_root + + meta_sinfo->bytes_may_use; + thresh = meta_sinfo->total_bytes - thresh; + thresh *= 80; do_div(thresh, 100); + if (thresh <= meta_sinfo->bytes_delalloc) + meta_sinfo->force_delalloc = 1; + else + meta_sinfo->force_delalloc = 0; +} - if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) { - struct btrfs_trans_handle *trans; - if (!meta_sinfo->full) { - meta_sinfo->force_alloc = 1; - spin_unlock(&meta_sinfo->lock); +struct async_flush { + struct btrfs_root *root; + struct btrfs_space_info *info; + struct btrfs_work work; +}; - trans = btrfs_start_transaction(root, 1); - if (!trans) - return -ENOMEM; +static noinline void flush_delalloc_async(struct btrfs_work *work) +{ + struct async_flush *async; + struct btrfs_root *root; + struct btrfs_space_info *info; - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, alloc_target, 0); - btrfs_end_transaction(trans, root); + async = container_of(work, struct async_flush, work); + root = async->root; + info = async->info; + + btrfs_start_delalloc_inodes(root); + wake_up(&info->flush_wait); + btrfs_wait_ordered_extents(root, 0); + + spin_lock(&info->lock); + info->flushing = 0; + spin_unlock(&info->lock); + wake_up(&info->flush_wait); + + kfree(async); +} + +static void wait_on_flush(struct btrfs_space_info *info) +{ + DEFINE_WAIT(wait); + u64 used; + + while (1) { + prepare_to_wait(&info->flush_wait, &wait, + TASK_UNINTERRUPTIBLE); + spin_lock(&info->lock); + if (!info->flushing) { + spin_unlock(&info->lock); + break; + } + + used = info->bytes_used + info->bytes_reserved + + info->bytes_pinned + info->bytes_readonly + + info->bytes_super + info->bytes_root + + info->bytes_may_use + info->bytes_delalloc; + if (used < info->total_bytes) { + spin_unlock(&info->lock); + break; + } + spin_unlock(&info->lock); + schedule(); + } + finish_wait(&info->flush_wait, &wait); +} + +static void flush_delalloc(struct btrfs_root *root, + struct btrfs_space_info *info) +{ + struct async_flush *async; + bool wait = false; + + spin_lock(&info->lock); + + if (!info->flushing) { + info->flushing = 1; + init_waitqueue_head(&info->flush_wait); + } else { + wait = true; + } + + spin_unlock(&info->lock); + + if (wait) { + wait_on_flush(info); + return; + } + + async = kzalloc(sizeof(*async), GFP_NOFS); + if (!async) + goto flush; + + async->root = root; + async->info = info; + async->work.func = flush_delalloc_async; + + btrfs_queue_worker(&root->fs_info->enospc_workers, + &async->work); + wait_on_flush(info); + return; + +flush: + btrfs_start_delalloc_inodes(root); + btrfs_wait_ordered_extents(root, 0); + + spin_lock(&info->lock); + info->flushing = 0; + spin_unlock(&info->lock); + wake_up(&info->flush_wait); +} + +static int maybe_allocate_chunk(struct btrfs_root *root, + struct btrfs_space_info *info) +{ + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_trans_handle *trans; + bool wait = false; + int ret = 0; + u64 min_metadata; + u64 free_space; + + free_space = btrfs_super_total_bytes(disk_super); + /* + * we allow the metadata to grow to a max of either 5gb or 5% of the + * space in the volume. + */ + min_metadata = min((u64)5 * 1024 * 1024 * 1024, + div64_u64(free_space * 5, 100)); + if (info->total_bytes >= min_metadata) { + spin_unlock(&info->lock); + return 0; + } + + if (info->full) { + spin_unlock(&info->lock); + return 0; + } + + if (!info->allocating_chunk) { + info->force_alloc = 1; + info->allocating_chunk = 1; + init_waitqueue_head(&info->allocate_wait); + } else { + wait = true; + } + + spin_unlock(&info->lock); + + if (wait) { + wait_event(info->allocate_wait, + !info->allocating_chunk); + return 1; + } + + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto out; + } + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + 4096 + 2 * 1024 * 1024, + info->flags, 0); + btrfs_end_transaction(trans, root); + if (ret) + goto out; +out: + spin_lock(&info->lock); + info->allocating_chunk = 0; + spin_unlock(&info->lock); + wake_up(&info->allocate_wait); + + if (ret) + return 0; + return 1; +} + +/* + * Reserve metadata space for delalloc. + */ +int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, + struct inode *inode, int num_items) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 num_bytes; + u64 used; + u64 alloc_target; + int flushed = 0; + int force_delalloc; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + num_bytes = calculate_bytes_needed(root->fs_info->extent_root, + num_items); +again: + spin_lock(&meta_sinfo->lock); + + force_delalloc = meta_sinfo->force_delalloc; + + if (unlikely(!meta_sinfo->bytes_root)) + meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); + + if (!flushed) + meta_sinfo->bytes_delalloc += num_bytes; + + used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super + meta_sinfo->bytes_root + + meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; + + if (used > meta_sinfo->total_bytes) { + flushed++; + + if (flushed == 1) { + if (maybe_allocate_chunk(root, meta_sinfo)) + goto again; + flushed++; + } else { + spin_unlock(&meta_sinfo->lock); + } + + if (flushed == 2) { + filemap_flush(inode->i_mapping); + goto again; + } else if (flushed == 3) { + flush_delalloc(root, meta_sinfo); goto again; } + spin_lock(&meta_sinfo->lock); + meta_sinfo->bytes_delalloc -= num_bytes; spin_unlock(&meta_sinfo->lock); + printk(KERN_ERR "enospc, has %d, reserved %d\n", + BTRFS_I(inode)->outstanding_extents, + BTRFS_I(inode)->reserved_extents); + dump_space_info(meta_sinfo, 0, 0); + return -ENOSPC; + } - if (!committed) { - committed = 1; - trans = btrfs_join_transaction(root, 1); - if (!trans) - return -ENOMEM; - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; + BTRFS_I(inode)->reserved_extents++; + check_force_delalloc(meta_sinfo); + spin_unlock(&meta_sinfo->lock); + + if (!flushed && force_delalloc) + filemap_flush(inode->i_mapping); + + return 0; +} + +/* + * unreserve num_items number of items worth of metadata space. This needs to + * be paired with btrfs_reserve_metadata_space. + * + * NOTE: if you have the option, run this _AFTER_ you do a + * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref + * oprations which will result in more used metadata, so we want to make sure we + * can do that without issue. + */ +int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 num_bytes; + u64 alloc_target; + bool bug = false; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + num_bytes = calculate_bytes_needed(root, num_items); + + spin_lock(&meta_sinfo->lock); + if (meta_sinfo->bytes_may_use < num_bytes) { + bug = true; + meta_sinfo->bytes_may_use = 0; + } else { + meta_sinfo->bytes_may_use -= num_bytes; + } + spin_unlock(&meta_sinfo->lock); + + BUG_ON(bug); + + return 0; +} + +/* + * Reserve some metadata space for use. We'll calculate the worste case number + * of bytes that would be needed to modify num_items number of items. If we + * have space, fantastic, if not, you get -ENOSPC. Please call + * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of + * items you reserved, since whatever metadata you needed should have already + * been allocated. + * + * This will commit the transaction to make more space if we don't have enough + * metadata space. THe only time we don't do this is if we're reserving space + * inside of a transaction, then we will just return -ENOSPC and it is the + * callers responsibility to handle it properly. + */ +int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 num_bytes; + u64 used; + u64 alloc_target; + int retries = 0; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + num_bytes = calculate_bytes_needed(root, num_items); +again: + spin_lock(&meta_sinfo->lock); + + if (unlikely(!meta_sinfo->bytes_root)) + meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); + + if (!retries) + meta_sinfo->bytes_may_use += num_bytes; + + used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + + meta_sinfo->bytes_super + meta_sinfo->bytes_root + + meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; + + if (used > meta_sinfo->total_bytes) { + retries++; + if (retries == 1) { + if (maybe_allocate_chunk(root, meta_sinfo)) + goto again; + retries++; + } else { + spin_unlock(&meta_sinfo->lock); + } + + if (retries == 2) { + flush_delalloc(root, meta_sinfo); goto again; } + spin_lock(&meta_sinfo->lock); + meta_sinfo->bytes_may_use -= num_bytes; + spin_unlock(&meta_sinfo->lock); + + dump_space_info(meta_sinfo, 0, 0); return -ENOSPC; } + + check_force_delalloc(meta_sinfo); spin_unlock(&meta_sinfo->lock); return 0; @@ -2765,13 +3225,16 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); data_sinfo = BTRFS_I(inode)->space_info; + if (!data_sinfo) + goto alloc; + again: /* make sure we have enough space to handle the data first */ spin_lock(&data_sinfo->lock); if (data_sinfo->total_bytes - data_sinfo->bytes_used - data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - - data_sinfo->bytes_may_use < bytes) { + data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) { struct btrfs_trans_handle *trans; /* @@ -2783,7 +3246,7 @@ again: data_sinfo->force_alloc = 1; spin_unlock(&data_sinfo->lock); - +alloc: alloc_target = btrfs_get_alloc_profile(root, 1); trans = btrfs_start_transaction(root, 1); if (!trans) @@ -2795,12 +3258,17 @@ again: btrfs_end_transaction(trans, root); if (ret) return ret; + + if (!data_sinfo) { + btrfs_set_inode_space_info(root, inode); + data_sinfo = BTRFS_I(inode)->space_info; + } goto again; } spin_unlock(&data_sinfo->lock); /* commit the current transaction and try again */ - if (!committed) { + if (!committed && !root->fs_info->open_ioctl_trans) { committed = 1; trans = btrfs_join_transaction(root, 1); if (!trans) @@ -2828,7 +3296,7 @@ again: BTRFS_I(inode)->reserved_bytes += bytes; spin_unlock(&data_sinfo->lock); - return btrfs_check_metadata_free_space(root); + return 0; } /* @@ -2927,17 +3395,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, BUG_ON(!space_info); spin_lock(&space_info->lock); - if (space_info->force_alloc) { + if (space_info->force_alloc) force = 1; - space_info->force_alloc = 0; - } if (space_info->full) { spin_unlock(&space_info->lock); goto out; } thresh = space_info->total_bytes - space_info->bytes_readonly; - thresh = div_factor(thresh, 6); + thresh = div_factor(thresh, 8); if (!force && (space_info->bytes_used + space_info->bytes_pinned + space_info->bytes_reserved + alloc_bytes) < thresh) { @@ -2951,7 +3417,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, * we keep a reasonable number of metadata chunks allocated in the * FS as well. */ - if (flags & BTRFS_BLOCK_GROUP_DATA) { + if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { fs_info->data_chunk_allocations++; if (!(fs_info->data_chunk_allocations % fs_info->metadata_ratio)) @@ -2959,8 +3425,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, } ret = btrfs_alloc_chunk(trans, extent_root, flags); + spin_lock(&space_info->lock); if (ret) space_info->full = 1; + space_info->force_alloc = 0; + spin_unlock(&space_info->lock); out: mutex_unlock(&extent_root->fs_info->chunk_mutex); return ret; @@ -3009,10 +3478,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, num_bytes = min(total, cache->key.offset - byte_in_group); if (alloc) { old_val += num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + cache->reserved -= num_bytes; cache->space_info->bytes_used += num_bytes; + cache->space_info->bytes_reserved -= num_bytes; if (cache->ro) cache->space_info->bytes_readonly -= num_bytes; - btrfs_set_block_group_used(&cache->item, old_val); spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); } else { @@ -3057,127 +3528,136 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) return bytenr; } -int btrfs_update_pinned_extents(struct btrfs_root *root, - u64 bytenr, u64 num, int pin) +/* + * this function must be called within transaction + */ +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int reserved) { - u64 len; - struct btrfs_block_group_cache *cache; struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *cache; - if (pin) - set_extent_dirty(&fs_info->pinned_extents, - bytenr, bytenr + num - 1, GFP_NOFS); - - while (num > 0) { - cache = btrfs_lookup_block_group(fs_info, bytenr); - BUG_ON(!cache); - len = min(num, cache->key.offset - - (bytenr - cache->key.objectid)); - if (pin) { - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - cache->pinned += len; - cache->space_info->bytes_pinned += len; - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - fs_info->total_pinned += len; - } else { - int unpin = 0; + cache = btrfs_lookup_block_group(fs_info, bytenr); + BUG_ON(!cache); - /* - * in order to not race with the block group caching, we - * only want to unpin the extent if we are cached. If - * we aren't cached, we want to start async caching this - * block group so we can free the extent the next time - * around. - */ - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - unpin = (cache->cached == BTRFS_CACHE_FINISHED); - if (likely(unpin)) { - cache->pinned -= len; - cache->space_info->bytes_pinned -= len; - fs_info->total_pinned -= len; - } - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned += num_bytes; + cache->space_info->bytes_pinned += num_bytes; + if (reserved) { + cache->reserved -= num_bytes; + cache->space_info->bytes_reserved -= num_bytes; + } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); - if (likely(unpin)) - clear_extent_dirty(&fs_info->pinned_extents, - bytenr, bytenr + len -1, - GFP_NOFS); - else - cache_block_group(cache); + btrfs_put_block_group(cache); - if (unpin) - btrfs_add_free_space(cache, bytenr, len); - } - btrfs_put_block_group(cache); - bytenr += len; - num -= len; + set_extent_dirty(fs_info->pinned_extents, + bytenr, bytenr + num_bytes - 1, GFP_NOFS); + return 0; +} + +static int update_reserved_extents(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve) +{ + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + if (reserve) { + cache->reserved += num_bytes; + cache->space_info->bytes_reserved += num_bytes; + } else { + cache->reserved -= num_bytes; + cache->space_info->bytes_reserved -= num_bytes; } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); return 0; } -static int update_reserved_extents(struct btrfs_root *root, - u64 bytenr, u64 num, int reserve) +int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { - u64 len; - struct btrfs_block_group_cache *cache; struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_caching_control *next; + struct btrfs_caching_control *caching_ctl; + struct btrfs_block_group_cache *cache; - while (num > 0) { - cache = btrfs_lookup_block_group(fs_info, bytenr); - BUG_ON(!cache); - len = min(num, cache->key.offset - - (bytenr - cache->key.objectid)); + down_write(&fs_info->extent_commit_sem); - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - if (reserve) { - cache->reserved += len; - cache->space_info->bytes_reserved += len; + list_for_each_entry_safe(caching_ctl, next, + &fs_info->caching_block_groups, list) { + cache = caching_ctl->block_group; + if (block_group_cache_done(cache)) { + cache->last_byte_to_unpin = (u64)-1; + list_del_init(&caching_ctl->list); + put_caching_control(caching_ctl); } else { - cache->reserved -= len; - cache->space_info->bytes_reserved -= len; + cache->last_byte_to_unpin = caching_ctl->progress; } - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - btrfs_put_block_group(cache); - bytenr += len; - num -= len; } + + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) + fs_info->pinned_extents = &fs_info->freed_extents[1]; + else + fs_info->pinned_extents = &fs_info->freed_extents[0]; + + up_write(&fs_info->extent_commit_sem); return 0; } -int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) +static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) { - u64 last = 0; - u64 start; - u64 end; - struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; - int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *cache = NULL; + u64 len; - while (1) { - ret = find_first_extent_bit(pinned_extents, last, - &start, &end, EXTENT_DIRTY); - if (ret) - break; + while (start <= end) { + if (!cache || + start >= cache->key.objectid + cache->key.offset) { + if (cache) + btrfs_put_block_group(cache); + cache = btrfs_lookup_block_group(fs_info, start); + BUG_ON(!cache); + } - set_extent_dirty(copy, start, end, GFP_NOFS); - last = end + 1; + len = cache->key.objectid + cache->key.offset - start; + len = min(len, end + 1 - start); + + if (start < cache->last_byte_to_unpin) { + len = min(len, cache->last_byte_to_unpin - start); + btrfs_add_free_space(cache, start, len); + } + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned -= len; + cache->space_info->bytes_pinned -= len; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + + start += len; } + + if (cache) + btrfs_put_block_group(cache); return 0; } int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_io_tree *unpin) + struct btrfs_root *root) { + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *unpin; u64 start; u64 end; int ret; + if (fs_info->pinned_extents == &fs_info->freed_extents[0]) + unpin = &fs_info->freed_extents[1]; + else + unpin = &fs_info->freed_extents[0]; + while (1) { ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); @@ -3186,10 +3666,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, ret = btrfs_discard_extent(root, start, end + 1 - start); - /* unlocks the pinned mutex */ - btrfs_update_pinned_extents(root, start, end + 1 - start, 0); clear_extent_dirty(unpin, start, end, GFP_NOFS); - + unpin_extent_range(root, start, end); cond_resched(); } @@ -3199,7 +3677,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, static int pin_down_bytes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - u64 bytenr, u64 num_bytes, int is_data, + u64 bytenr, u64 num_bytes, + int is_data, int reserved, struct extent_buffer **must_clean) { int err = 0; @@ -3208,6 +3687,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, if (is_data) goto pinit; + /* + * discard is sloooow, and so triggering discards on + * individual btree blocks isn't a good plan. Just + * pin everything in discard mode. + */ + if (btrfs_test_opt(root, DISCARD)) + goto pinit; + buf = btrfs_find_tree_block(root, bytenr, num_bytes); if (!buf) goto pinit; @@ -3231,15 +3718,15 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, } free_extent_buffer(buf); pinit: - btrfs_set_path_blocking(path); + if (path) + btrfs_set_path_blocking(path); /* unlocks the pinned mutex */ - btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); + btrfs_pin_extent(root, bytenr, num_bytes, reserved); BUG_ON(err < 0); return 0; } - static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -3413,7 +3900,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } ret = pin_down_bytes(trans, root, path, bytenr, - num_bytes, is_data, &must_clean); + num_bytes, is_data, 0, &must_clean); if (ret > 0) mark_free = 1; BUG_ON(ret < 0); @@ -3544,8 +4031,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); /* unlocks the pinned mutex */ - btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); - update_reserved_extents(root, bytenr, num_bytes, 0); + btrfs_pin_extent(root, bytenr, num_bytes, 1); ret = 0; } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, @@ -3585,19 +4071,33 @@ static noinline int wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, u64 num_bytes) { + struct btrfs_caching_control *caching_ctl; DEFINE_WAIT(wait); - prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE); - - if (block_group_cache_done(cache)) { - finish_wait(&cache->caching_q, &wait); + caching_ctl = get_caching_control(cache); + if (!caching_ctl) return 0; - } - schedule(); - finish_wait(&cache->caching_q, &wait); - wait_event(cache->caching_q, block_group_cache_done(cache) || + wait_event(caching_ctl->wait, block_group_cache_done(cache) || (cache->free_space >= num_bytes)); + + put_caching_control(caching_ctl); + return 0; +} + +static noinline int +wait_block_group_cache_done(struct btrfs_block_group_cache *cache) +{ + struct btrfs_caching_control *caching_ctl; + DEFINE_WAIT(wait); + + caching_ctl = get_caching_control(cache); + if (!caching_ctl) + return 0; + + wait_event(caching_ctl->wait, block_group_cache_done(cache)); + + put_caching_control(caching_ctl); return 0; } @@ -3635,6 +4135,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int last_ptr_loop = 0; int loop = 0; bool found_uncached_bg = false; + bool failed_cluster_refill = false; + bool failed_alloc = false; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -3732,7 +4234,16 @@ have_block_group: if (unlikely(block_group->ro)) goto loop; - if (last_ptr) { + /* + * Ok we want to try and use the cluster allocator, so lets look + * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will + * have tried the cluster allocator plenty of times at this + * point and not have found anything, so we are likely way too + * fragmented for the clustering stuff to find anything, so lets + * just skip it and let the allocator find whatever block it can + * find + */ + if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { /* * the refill lock keeps out other * people trying to start a new cluster @@ -3807,9 +4318,11 @@ refill_cluster: spin_unlock(&last_ptr->refill_lock); goto checks; } - } else if (!cached && loop > LOOP_CACHING_NOWAIT) { + } else if (!cached && loop > LOOP_CACHING_NOWAIT + && !failed_cluster_refill) { spin_unlock(&last_ptr->refill_lock); + failed_cluster_refill = true; wait_block_group_cache_progress(block_group, num_bytes + empty_cluster + empty_size); goto have_block_group; @@ -3821,25 +4334,30 @@ refill_cluster: * cluster. Free the cluster we've been trying * to use, and go to the next block group */ - if (loop < LOOP_NO_EMPTY_SIZE) { - btrfs_return_cluster_to_free_space(NULL, - last_ptr); - spin_unlock(&last_ptr->refill_lock); - goto loop; - } + btrfs_return_cluster_to_free_space(NULL, last_ptr); spin_unlock(&last_ptr->refill_lock); + goto loop; } offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); - if (!offset && (cached || (!cached && - loop == LOOP_CACHING_NOWAIT))) { - goto loop; - } else if (!offset && (!cached && - loop > LOOP_CACHING_NOWAIT)) { + /* + * If we didn't find a chunk, and we haven't failed on this + * block group before, and this block group is in the middle of + * caching and we are ok with waiting, then go ahead and wait + * for progress to be made, and set failed_alloc to true. + * + * If failed_alloc is true then we've already waited on this + * block group once and should move on to the next block group. + */ + if (!offset && !failed_alloc && !cached && + loop > LOOP_CACHING_NOWAIT) { wait_block_group_cache_progress(block_group, - num_bytes + empty_size); + num_bytes + empty_size); + failed_alloc = true; goto have_block_group; + } else if (!offset) { + goto loop; } checks: search_start = stripe_align(root, offset); @@ -3881,9 +4399,13 @@ checks: search_start - offset); BUG_ON(offset > search_start); + update_reserved_extents(block_group, num_bytes, 1); + /* we are all good, lets return */ break; loop: + failed_cluster_refill = false; + failed_alloc = false; btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); @@ -3941,21 +4463,32 @@ loop: return ret; } -static void dump_space_info(struct btrfs_space_info *info, u64 bytes) +static void dump_space_info(struct btrfs_space_info *info, u64 bytes, + int dump_block_groups) { struct btrfs_block_group_cache *cache; + spin_lock(&info->lock); printk(KERN_INFO "space_info has %llu free, is %sfull\n", (unsigned long long)(info->total_bytes - info->bytes_used - - info->bytes_pinned - info->bytes_reserved), + info->bytes_pinned - info->bytes_reserved - + info->bytes_super), (info->full) ? "" : "not "); printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," - " may_use=%llu, used=%llu\n", + " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" + "\n", (unsigned long long)info->total_bytes, (unsigned long long)info->bytes_pinned, (unsigned long long)info->bytes_delalloc, (unsigned long long)info->bytes_may_use, - (unsigned long long)info->bytes_used); + (unsigned long long)info->bytes_used, + (unsigned long long)info->bytes_root, + (unsigned long long)info->bytes_super, + (unsigned long long)info->bytes_reserved); + spin_unlock(&info->lock); + + if (!dump_block_groups) + return; down_read(&info->groups_sem); list_for_each_entry(cache, &info->block_groups, list) { @@ -3973,12 +4506,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes) up_read(&info->groups_sem); } -static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 min_alloc_size, - u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, - u64 data) +int btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data) { int ret; u64 search_start = 0; @@ -4023,7 +4556,7 @@ again: printk(KERN_ERR "btrfs allocation failed flags %llu, " "wanted %llu\n", (unsigned long long)data, (unsigned long long)num_bytes); - dump_space_info(sinfo, num_bytes); + dump_space_info(sinfo, num_bytes, 1); } return ret; @@ -4044,25 +4577,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) ret = btrfs_discard_extent(root, start, len); btrfs_add_free_space(cache, start, len); + update_reserved_extents(cache, len, 0); btrfs_put_block_group(cache); - update_reserved_extents(root, start, len, 0); - - return ret; -} - -int btrfs_reserve_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 min_alloc_size, - u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, - u64 data) -{ - int ret; - ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, - empty_size, hint_byte, search_end, ins, - data); - if (!ret) - update_reserved_extents(root, ins->objectid, ins->offset, 1); return ret; } @@ -4223,15 +4739,46 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, { int ret; struct btrfs_block_group_cache *block_group; + struct btrfs_caching_control *caching_ctl; + u64 start = ins->objectid; + u64 num_bytes = ins->offset; block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); cache_block_group(block_group); - wait_event(block_group->caching_q, - block_group_cache_done(block_group)); + caching_ctl = get_caching_control(block_group); - ret = btrfs_remove_free_space(block_group, ins->objectid, - ins->offset); - BUG_ON(ret); + if (!caching_ctl) { + BUG_ON(!block_group_cache_done(block_group)); + ret = btrfs_remove_free_space(block_group, start, num_bytes); + BUG_ON(ret); + } else { + mutex_lock(&caching_ctl->mutex); + + if (start >= caching_ctl->progress) { + ret = add_excluded_extent(root, start, num_bytes); + BUG_ON(ret); + } else if (start + num_bytes <= caching_ctl->progress) { + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + BUG_ON(ret); + } else { + num_bytes = caching_ctl->progress - start; + ret = btrfs_remove_free_space(block_group, + start, num_bytes); + BUG_ON(ret); + + start = caching_ctl->progress; + num_bytes = ins->objectid + ins->offset - + caching_ctl->progress; + ret = add_excluded_extent(root, start, num_bytes); + BUG_ON(ret); + } + + mutex_unlock(&caching_ctl->mutex); + put_caching_control(caching_ctl); + } + + update_reserved_extents(block_group, ins->offset, 1); btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); @@ -4255,9 +4802,9 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, int ret; u64 flags = 0; - ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, - empty_size, hint_byte, search_end, - ins, 0); + ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes, + empty_size, hint_byte, search_end, + ins, 0); if (ret) return ret; @@ -4268,7 +4815,6 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, } else BUG_ON(parent > 0); - update_reserved_extents(root, ins->objectid, ins->offset, 1); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { struct btrfs_delayed_extent_op *extent_op; extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); @@ -4347,452 +4893,108 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return buf; } -#if 0 -int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *leaf) -{ - u64 disk_bytenr; - u64 num_bytes; - struct btrfs_key key; - struct btrfs_file_extent_item *fi; - u32 nritems; - int i; - int ret; - - BUG_ON(!btrfs_is_leaf(leaf)); - nritems = btrfs_header_nritems(leaf); - - for (i = 0; i < nritems; i++) { - cond_resched(); - btrfs_item_key_to_cpu(leaf, &key, i); - - /* only extents have references, skip everything else */ - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) - continue; - - fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); - - /* inline extents live in the btree, they don't have refs */ - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - - /* holes don't have refs */ - if (disk_bytenr == 0) - continue; - - num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, - leaf->start, 0, key.objectid, 0); - BUG_ON(ret); - } - return 0; -} - -static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_leaf_ref *ref) -{ - int i; - int ret; - struct btrfs_extent_info *info; - struct refsort *sorted; - - if (ref->nritems == 0) - return 0; - - sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS); - for (i = 0; i < ref->nritems; i++) { - sorted[i].bytenr = ref->extents[i].bytenr; - sorted[i].slot = i; - } - sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL); - - /* - * the items in the ref were sorted when the ref was inserted - * into the ref cache, so this is already in order - */ - for (i = 0; i < ref->nritems; i++) { - info = ref->extents + sorted[i].slot; - ret = btrfs_free_extent(trans, root, info->bytenr, - info->num_bytes, ref->bytenr, - ref->owner, ref->generation, - info->objectid, 0); - - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); - cond_resched(); - - BUG_ON(ret); - info++; - } - - kfree(sorted); - return 0; -} - - -static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 start, - u64 len, u32 *refs) -{ - int ret; - - ret = btrfs_lookup_extent_refs(trans, root, start, len, refs); - BUG_ON(ret); - -#if 0 /* some debugging code in case we see problems here */ - /* if the refs count is one, it won't get increased again. But - * if the ref count is > 1, someone may be decreasing it at - * the same time we are. - */ - if (*refs != 1) { - struct extent_buffer *eb = NULL; - eb = btrfs_find_create_tree_block(root, start, len); - if (eb) - btrfs_tree_lock(eb); - - mutex_lock(&root->fs_info->alloc_mutex); - ret = lookup_extent_ref(NULL, root, start, len, refs); - BUG_ON(ret); - mutex_unlock(&root->fs_info->alloc_mutex); - - if (eb) { - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - } - if (*refs == 1) { - printk(KERN_ERR "btrfs block %llu went down to one " - "during drop_snap\n", (unsigned long long)start); - } - - } -#endif - - cond_resched(); - return ret; -} +struct walk_control { + u64 refs[BTRFS_MAX_LEVEL]; + u64 flags[BTRFS_MAX_LEVEL]; + struct btrfs_key update_progress; + int stage; + int level; + int shared_level; + int update_ref; + int keep_locks; + int reada_slot; + int reada_count; +}; +#define DROP_REFERENCE 1 +#define UPDATE_BACKREF 2 -/* - * this is used while deleting old snapshots, and it drops the refs - * on a whole subtree starting from a level 1 node. - * - * The idea is to sort all the leaf pointers, and then drop the - * ref on all the leaves in order. Most of the time the leaves - * will have ref cache entries, so no leaf IOs will be required to - * find the extents they have references on. - * - * For each leaf, any references it has are also dropped in order - * - * This ends up dropping the references in something close to optimal - * order for reading and modifying the extent allocation tree. - */ -static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path) +static noinline void reada_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct walk_control *wc, + struct btrfs_path *path) { u64 bytenr; - u64 root_owner; - u64 root_gen; - struct extent_buffer *eb = path->nodes[1]; - struct extent_buffer *leaf; - struct btrfs_leaf_ref *ref; - struct refsort *sorted = NULL; - int nritems = btrfs_header_nritems(eb); + u64 generation; + u64 refs; + u64 flags; + u64 last = 0; + u32 nritems; + u32 blocksize; + struct btrfs_key key; + struct extent_buffer *eb; int ret; - int i; - int refi = 0; - int slot = path->slots[1]; - u32 blocksize = btrfs_level_size(root, 0); - u32 refs; - - if (nritems == 0) - goto out; - - root_owner = btrfs_header_owner(eb); - root_gen = btrfs_header_generation(eb); - sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); + int slot; + int nread = 0; - /* - * step one, sort all the leaf pointers so we don't scribble - * randomly into the extent allocation tree - */ - for (i = slot; i < nritems; i++) { - sorted[refi].bytenr = btrfs_node_blockptr(eb, i); - sorted[refi].slot = i; - refi++; + if (path->slots[wc->level] < wc->reada_slot) { + wc->reada_count = wc->reada_count * 2 / 3; + wc->reada_count = max(wc->reada_count, 2); + } else { + wc->reada_count = wc->reada_count * 3 / 2; + wc->reada_count = min_t(int, wc->reada_count, + BTRFS_NODEPTRS_PER_BLOCK(root)); } - /* - * nritems won't be zero, but if we're picking up drop_snapshot - * after a crash, slot might be > 0, so double check things - * just in case. - */ - if (refi == 0) - goto out; + eb = path->nodes[wc->level]; + nritems = btrfs_header_nritems(eb); + blocksize = btrfs_level_size(root, wc->level - 1); - sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); + for (slot = path->slots[wc->level]; slot < nritems; slot++) { + if (nread >= wc->reada_count) + break; - /* - * the first loop frees everything the leaves point to - */ - for (i = 0; i < refi; i++) { - u64 ptr_gen; + cond_resched(); + bytenr = btrfs_node_blockptr(eb, slot); + generation = btrfs_node_ptr_generation(eb, slot); - bytenr = sorted[i].bytenr; + if (slot == path->slots[wc->level]) + goto reada; - /* - * check the reference count on this leaf. If it is > 1 - * we just decrement it below and don't update any - * of the refs the leaf points to. - */ - ret = drop_snap_lookup_refcount(trans, root, bytenr, - blocksize, &refs); - BUG_ON(ret); - if (refs != 1) + if (wc->stage == UPDATE_BACKREF && + generation <= root->root_key.offset) continue; - ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot); - - /* - * the leaf only had one reference, which means the - * only thing pointing to this leaf is the snapshot - * we're deleting. It isn't possible for the reference - * count to increase again later - * - * The reference cache is checked for the leaf, - * and if found we'll be able to drop any refs held by - * the leaf without needing to read it in. - */ - ref = btrfs_lookup_leaf_ref(root, bytenr); - if (ref && ref->generation != ptr_gen) { - btrfs_free_leaf_ref(root, ref); - ref = NULL; - } - if (ref) { - ret = cache_drop_leaf_ref(trans, root, ref); - BUG_ON(ret); - btrfs_remove_leaf_ref(root, ref); - btrfs_free_leaf_ref(root, ref); - } else { - /* - * the leaf wasn't in the reference cache, so - * we have to read it. - */ - leaf = read_tree_block(root, bytenr, blocksize, - ptr_gen); - ret = btrfs_drop_leaf_ref(trans, root, leaf); - BUG_ON(ret); - free_extent_buffer(leaf); - } - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); - cond_resched(); - } - - /* - * run through the loop again to free the refs on the leaves. - * This is faster than doing it in the loop above because - * the leaves are likely to be clustered together. We end up - * working in nice chunks on the extent allocation tree. - */ - for (i = 0; i < refi; i++) { - bytenr = sorted[i].bytenr; - ret = btrfs_free_extent(trans, root, bytenr, - blocksize, eb->start, - root_owner, root_gen, 0, 1); + /* We don't lock the tree block, it's OK to be racy here */ + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &refs, &flags); BUG_ON(ret); + BUG_ON(refs == 0); - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); - cond_resched(); - } -out: - kfree(sorted); - - /* - * update the path to show we've processed the entire level 1 - * node. This will get saved into the root's drop_snapshot_progress - * field so these drops are not repeated again if this transaction - * commits. - */ - path->slots[1] = nritems; - return 0; -} - -/* - * helper function for drop_snapshot, this walks down the tree dropping ref - * counts as it goes. - */ -static noinline int walk_down_tree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int *level) -{ - u64 root_owner; - u64 root_gen; - u64 bytenr; - u64 ptr_gen; - struct extent_buffer *next; - struct extent_buffer *cur; - struct extent_buffer *parent; - u32 blocksize; - int ret; - u32 refs; - - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start, - path->nodes[*level]->len, &refs); - BUG_ON(ret); - if (refs > 1) - goto out; - - /* - * walk down to the last node level and free all the leaves - */ - while (*level >= 0) { - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - cur = path->nodes[*level]; - - if (btrfs_header_level(cur) != *level) - WARN_ON(1); - - if (path->slots[*level] >= - btrfs_header_nritems(cur)) - break; + if (wc->stage == DROP_REFERENCE) { + if (refs == 1) + goto reada; - /* the new code goes down to level 1 and does all the - * leaves pointed to that node in bulk. So, this check - * for level 0 will always be false. - * - * But, the disk format allows the drop_snapshot_progress - * field in the root to leave things in a state where - * a leaf will need cleaning up here. If someone crashes - * with the old code and then boots with the new code, - * we might find a leaf here. - */ - if (*level == 0) { - ret = btrfs_drop_leaf_ref(trans, root, cur); - BUG_ON(ret); - break; + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; + if (!wc->update_ref || + generation <= root->root_key.offset) + continue; + btrfs_node_key_to_cpu(eb, &key, slot); + ret = btrfs_comp_cpu_keys(&key, + &wc->update_progress); + if (ret < 0) + continue; + } else { + if (wc->level == 1 && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + continue; } - - /* - * once we get to level one, process the whole node - * at once, including everything below it. - */ - if (*level == 1) { - ret = drop_level_one_refs(trans, root, path); - BUG_ON(ret); +reada: + ret = readahead_tree_block(root, bytenr, blocksize, + generation); + if (ret) break; - } - - bytenr = btrfs_node_blockptr(cur, path->slots[*level]); - ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); - blocksize = btrfs_level_size(root, *level - 1); - - ret = drop_snap_lookup_refcount(trans, root, bytenr, - blocksize, &refs); - BUG_ON(ret); - - /* - * if there is more than one reference, we don't need - * to read that node to drop any references it has. We - * just drop the ref we hold on that node and move on to the - * next slot in this level. - */ - if (refs != 1) { - parent = path->nodes[*level]; - root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); - path->slots[*level]++; - - ret = btrfs_free_extent(trans, root, bytenr, - blocksize, parent->start, - root_owner, root_gen, - *level - 1, 1); - BUG_ON(ret); - - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); - cond_resched(); - - continue; - } - - /* - * we need to keep freeing things in the next level down. - * read the block and loop around to process it - */ - next = read_tree_block(root, bytenr, blocksize, ptr_gen); - WARN_ON(*level <= 0); - if (path->nodes[*level-1]) - free_extent_buffer(path->nodes[*level-1]); - path->nodes[*level-1] = next; - *level = btrfs_header_level(next); - path->slots[*level] = 0; - cond_resched(); + last = bytenr + blocksize; + nread++; } -out: - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - - if (path->nodes[*level] == root->node) { - parent = path->nodes[*level]; - bytenr = path->nodes[*level]->start; - } else { - parent = path->nodes[*level + 1]; - bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]); - } - - blocksize = btrfs_level_size(root, *level); - root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); - - /* - * cleanup and free the reference on the last node - * we processed - */ - ret = btrfs_free_extent(trans, root, bytenr, blocksize, - parent->start, root_owner, root_gen, - *level, 1); - free_extent_buffer(path->nodes[*level]); - path->nodes[*level] = NULL; - - *level += 1; - BUG_ON(ret); - - cond_resched(); - return 0; + wc->reada_slot = slot; } -#endif - -struct walk_control { - u64 refs[BTRFS_MAX_LEVEL]; - u64 flags[BTRFS_MAX_LEVEL]; - struct btrfs_key update_progress; - int stage; - int level; - int shared_level; - int update_ref; - int keep_locks; -}; - -#define DROP_REFERENCE 1 -#define UPDATE_BACKREF 2 /* * hepler to process tree block while walking down the tree. * - * when wc->stage == DROP_REFERENCE, this function checks - * reference count of the block. if the block is shared and - * we need update back refs for the subtree rooted at the - * block, this function changes wc->stage to UPDATE_BACKREF - * * when wc->stage == UPDATE_BACKREF, this function updates * back refs for pointers in the block. * @@ -4801,11 +5003,10 @@ struct walk_control { static noinline int walk_down_proc(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - struct walk_control *wc) + struct walk_control *wc, int lookup_info) { int level = wc->level; struct extent_buffer *eb = path->nodes[level]; - struct btrfs_key key; u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; int ret; @@ -4817,8 +5018,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, * when reference count of tree block is 1, it won't increase * again. once full backref flag is set, we never clear it. */ - if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || - (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) { + if (lookup_info && + ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || + (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { BUG_ON(!path->locks[level]); ret = btrfs_lookup_extent_info(trans, root, eb->start, eb->len, @@ -4828,21 +5030,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, BUG_ON(wc->refs[level] == 0); } - if (wc->stage == DROP_REFERENCE && - wc->update_ref && wc->refs[level] > 1) { - BUG_ON(eb == root->node); - BUG_ON(path->slots[level] > 0); - if (level == 0) - btrfs_item_key_to_cpu(eb, &key, path->slots[level]); - else - btrfs_node_key_to_cpu(eb, &key, path->slots[level]); - if (btrfs_header_owner(eb) == root->root_key.objectid && - btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) { - wc->stage = UPDATE_BACKREF; - wc->shared_level = level; - } - } - if (wc->stage == DROP_REFERENCE) { if (wc->refs[level] > 1) return 1; @@ -4879,6 +5066,136 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, } /* + * hepler to process tree block pointer. + * + * when wc->stage == DROP_REFERENCE, this function checks + * reference count of the block pointed to. if the block + * is shared and we need update back refs for the subtree + * rooted at the block, this function changes wc->stage to + * UPDATE_BACKREF. if the block is shared and there is no + * need to update back, this function drops the reference + * to the block. + * + * NOTE: return value 1 means we should stop walking down. + */ +static noinline int do_walk_down(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc, int *lookup_info) +{ + u64 bytenr; + u64 generation; + u64 parent; + u32 blocksize; + struct btrfs_key key; + struct extent_buffer *next; + int level = wc->level; + int reada = 0; + int ret = 0; + + generation = btrfs_node_ptr_generation(path->nodes[level], + path->slots[level]); + /* + * if the lower level block was created before the snapshot + * was created, we know there is no need to update back refs + * for the subtree + */ + if (wc->stage == UPDATE_BACKREF && + generation <= root->root_key.offset) { + *lookup_info = 1; + return 1; + } + + bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); + blocksize = btrfs_level_size(root, level - 1); + + next = btrfs_find_tree_block(root, bytenr, blocksize); + if (!next) { + next = btrfs_find_create_tree_block(root, bytenr, blocksize); + reada = 1; + } + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &wc->refs[level - 1], + &wc->flags[level - 1]); + BUG_ON(ret); + BUG_ON(wc->refs[level - 1] == 0); + *lookup_info = 0; + + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level - 1] > 1) { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; + + if (!wc->update_ref || + generation <= root->root_key.offset) + goto skip; + + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + ret = btrfs_comp_cpu_keys(&key, &wc->update_progress); + if (ret < 0) + goto skip; + + wc->stage = UPDATE_BACKREF; + wc->shared_level = level - 1; + } + } else { + if (level == 1 && + (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + goto skip; + } + + if (!btrfs_buffer_uptodate(next, generation)) { + btrfs_tree_unlock(next); + free_extent_buffer(next); + next = NULL; + *lookup_info = 1; + } + + if (!next) { + if (reada && level == 1) + reada_walk_down(trans, root, wc, path); + next = read_tree_block(root, bytenr, blocksize, generation); + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + } + + level--; + BUG_ON(level != btrfs_header_level(next)); + path->nodes[level] = next; + path->slots[level] = 0; + path->locks[level] = 1; + wc->level = level; + if (wc->level == 1) + wc->reada_slot = 0; + return 0; +skip: + wc->refs[level - 1] = 0; + wc->flags[level - 1] = 0; + if (wc->stage == DROP_REFERENCE) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + parent = path->nodes[level]->start; + } else { + BUG_ON(root->root_key.objectid != + btrfs_header_owner(path->nodes[level])); + parent = 0; + } + + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, + root->root_key.objectid, level - 1, 0); + BUG_ON(ret); + } + btrfs_tree_unlock(next); + free_extent_buffer(next); + *lookup_info = 1; + return 1; +} + +/* * hepler to process tree block while walking up the tree. * * when wc->stage == DROP_REFERENCE, this function drops @@ -4905,7 +5222,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, if (level < wc->shared_level) goto out; - BUG_ON(wc->refs[level] <= 1); ret = find_next_key(path, level + 1, &wc->update_progress); if (ret > 0) wc->update_ref = 0; @@ -4936,8 +5252,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, path->locks[level] = 0; return 1; } - } else { - BUG_ON(level != 0); } } @@ -4990,39 +5304,28 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct walk_control *wc) { - struct extent_buffer *next; - struct extent_buffer *cur; - u64 bytenr; - u64 ptr_gen; - u32 blocksize; int level = wc->level; + int lookup_info = 1; int ret; while (level >= 0) { - cur = path->nodes[level]; - BUG_ON(path->slots[level] >= btrfs_header_nritems(cur)); + if (path->slots[level] >= + btrfs_header_nritems(path->nodes[level])) + break; - ret = walk_down_proc(trans, root, path, wc); + ret = walk_down_proc(trans, root, path, wc, lookup_info); if (ret > 0) break; if (level == 0) break; - bytenr = btrfs_node_blockptr(cur, path->slots[level]); - blocksize = btrfs_level_size(root, level - 1); - ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]); - - next = read_tree_block(root, bytenr, blocksize, ptr_gen); - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); - - level--; - BUG_ON(level != btrfs_header_level(next)); - path->nodes[level] = next; - path->slots[level] = 0; - path->locks[level] = 1; - wc->level = level; + ret = do_walk_down(trans, root, path, wc, &lookup_info); + if (ret > 0) { + path->slots[level]++; + continue; + } + level = wc->level; } return 0; } @@ -5112,9 +5415,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) err = ret; goto out; } - btrfs_node_key_to_cpu(path->nodes[level], &key, - path->slots[level]); - WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key))); + WARN_ON(ret > 0); /* * unlock our path, this is safe because only this @@ -5149,6 +5450,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) wc->stage = DROP_REFERENCE; wc->update_ref = update_ref; wc->keep_locks = 0; + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { ret = walk_down_tree(trans, root, path, wc); @@ -5201,9 +5503,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) ret = btrfs_del_root(trans, tree_root, &root->root_key); BUG_ON(ret); - free_extent_buffer(root->node); - free_extent_buffer(root->commit_root); - kfree(root); + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_find_last_root(tree_root, root->root_key.objectid, + NULL, NULL); + BUG_ON(ret < 0); + if (ret > 0) { + ret = btrfs_del_orphan_item(trans, tree_root, + root->root_key.objectid); + BUG_ON(ret); + } + } + + if (root->in_radix) { + btrfs_free_fs_root(tree_root->fs_info, root); + } else { + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root); + } out: btrfs_end_transaction(trans, tree_root); kfree(wc); @@ -5255,6 +5572,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, wc->stage = DROP_REFERENCE; wc->update_ref = 0; wc->keep_locks = 1; + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); while (1) { wret = walk_down_tree(trans, root, path, wc); @@ -5397,9 +5715,9 @@ static noinline int relocate_data_extent(struct inode *reloc_inode, lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); while (1) { int ret; - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); break; @@ -6842,287 +7160,86 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root, return 0; } -#if 0 -static int __insert_orphan_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 size) -{ - struct btrfs_path *path; - struct btrfs_inode_item *item; - struct extent_buffer *leaf; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - path->leave_spinning = 1; - ret = btrfs_insert_empty_inode(trans, root, path, objectid); - if (ret) - goto out; - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); - btrfs_set_inode_generation(leaf, item, 1); - btrfs_set_inode_size(leaf, item, size); - btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); - btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(root, path); -out: - btrfs_free_path(path); - return ret; -} - -static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *group) +/* + * checks to see if its even possible to relocate this block group. + * + * @return - -1 if it's not a good idea to relocate this block group, 0 if its + * ok to go ahead and try. + */ +int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) { - struct inode *inode = NULL; - struct btrfs_trans_handle *trans; - struct btrfs_root *root; - struct btrfs_key root_key; - u64 objectid = BTRFS_FIRST_FREE_OBJECTID; - int err = 0; + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_device *device; + int full = 0; + int ret = 0; - root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; - root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = (u64)-1; - root = btrfs_read_fs_root_no_name(fs_info, &root_key); - if (IS_ERR(root)) - return ERR_CAST(root); + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); + /* odd, couldn't find the block group, leave it alone */ + if (!block_group) + return -1; - err = btrfs_find_free_objectid(trans, root, objectid, &objectid); - if (err) + /* no bytes used, we're good */ + if (!btrfs_block_group_used(&block_group->item)) goto out; - err = __insert_orphan_inode(trans, root, objectid, group->key.offset); - BUG_ON(err); - - err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, - group->key.offset, 0, group->key.offset, - 0, 0, 0); - BUG_ON(err); - - inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); - if (inode->i_state & I_NEW) { - BTRFS_I(inode)->root = root; - BTRFS_I(inode)->location.objectid = objectid; - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.offset = 0; - btrfs_read_locked_inode(inode); - unlock_new_inode(inode); - BUG_ON(is_bad_inode(inode)); - } else { - BUG_ON(1); - } - BTRFS_I(inode)->index_cnt = group->key.objectid; - - err = btrfs_orphan_add(trans, inode); -out: - btrfs_end_transaction(trans, root); - if (err) { - if (inode) - iput(inode); - inode = ERR_PTR(err); - } - return inode; -} - -int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) -{ - - struct btrfs_ordered_sum *sums; - struct btrfs_sector_sum *sector_sum; - struct btrfs_ordered_extent *ordered; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct list_head list; - size_t offset; - int ret; - u64 disk_bytenr; - - INIT_LIST_HEAD(&list); - - ordered = btrfs_lookup_ordered_extent(inode, file_pos); - BUG_ON(ordered->file_offset != file_pos || ordered->len != len); - - disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; - ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, - disk_bytenr + len - 1, &list); - - while (!list_empty(&list)) { - sums = list_entry(list.next, struct btrfs_ordered_sum, list); - list_del_init(&sums->list); - - sector_sum = sums->sums; - sums->bytenr = ordered->start; + space_info = block_group->space_info; + spin_lock(&space_info->lock); - offset = 0; - while (offset < sums->len) { - sector_sum->bytenr += ordered->start - disk_bytenr; - sector_sum++; - offset += root->sectorsize; - } + full = space_info->full; - btrfs_add_ordered_sum(inode, ordered, sums); + /* + * if this is the last block group we have in this space, we can't + * relocate it unless we're able to allocate a new chunk below. + * + * Otherwise, we need to make sure we have room in the space to handle + * all of the extents from this block group. If we can, we're good + */ + if ((space_info->total_bytes != block_group->key.offset) && + (space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + btrfs_block_group_used(&block_group->item) < + space_info->total_bytes)) { + spin_unlock(&space_info->lock); + goto out; } - btrfs_put_ordered_extent(ordered); - return 0; -} - -int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start) -{ - struct btrfs_trans_handle *trans; - struct btrfs_path *path; - struct btrfs_fs_info *info = root->fs_info; - struct extent_buffer *leaf; - struct inode *reloc_inode; - struct btrfs_block_group_cache *block_group; - struct btrfs_key key; - u64 skipped; - u64 cur_byte; - u64 total_found; - u32 nritems; - int ret; - int progress; - int pass = 0; - - root = root->fs_info->extent_root; - - block_group = btrfs_lookup_block_group(info, group_start); - BUG_ON(!block_group); - - printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n", - (unsigned long long)block_group->key.objectid, - (unsigned long long)block_group->flags); - - path = btrfs_alloc_path(); - BUG_ON(!path); - - reloc_inode = create_reloc_inode(info, block_group); - BUG_ON(IS_ERR(reloc_inode)); - - __alloc_chunk_for_shrink(root, block_group, 1); - set_block_group_readonly(block_group); - - btrfs_start_delalloc_inodes(info->tree_root); - btrfs_wait_ordered_extents(info->tree_root, 0); -again: - skipped = 0; - total_found = 0; - progress = 0; - key.objectid = block_group->key.objectid; - key.offset = 0; - key.type = 0; - cur_byte = key.objectid; - - trans = btrfs_start_transaction(info->tree_root, 1); - btrfs_commit_transaction(trans, info->tree_root); + spin_unlock(&space_info->lock); - mutex_lock(&root->fs_info->cleaner_mutex); - btrfs_clean_old_snapshots(info->tree_root); - btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); - mutex_unlock(&root->fs_info->cleaner_mutex); + /* + * ok we don't have enough space, but maybe we have free space on our + * devices to allocate new chunks for relocation, so loop through our + * alloc devices and guess if we have enough space. However, if we + * were marked as full, then we know there aren't enough chunks, and we + * can just return. + */ + ret = -1; + if (full) + goto out; - trans = btrfs_start_transaction(info->tree_root, 1); - btrfs_commit_transaction(trans, info->tree_root); + mutex_lock(&root->fs_info->chunk_mutex); + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { + u64 min_free = btrfs_block_group_used(&block_group->item); + u64 dev_offset, max_avail; - while (1) { - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; -next: - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - if (ret == 1) { - ret = 0; + /* + * check to make sure we can actually find a chunk with enough + * space to fit our block group in. + */ + if (device->total_bytes > device->bytes_used + min_free) { + ret = find_free_dev_extent(NULL, device, min_free, + &dev_offset, &max_avail); + if (!ret) break; - } - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - - if (key.objectid >= block_group->key.objectid + - block_group->key.offset) - break; - - if (progress && need_resched()) { - btrfs_release_path(root, path); - cond_resched(); - progress = 0; - continue; - } - progress = 1; - - if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY || - key.objectid + key.offset <= cur_byte) { - path->slots[0]++; - goto next; + ret = -1; } - - total_found++; - cur_byte = key.objectid + key.offset; - btrfs_release_path(root, path); - - __alloc_chunk_for_shrink(root, block_group, 0); - ret = relocate_one_extent(root, path, &key, block_group, - reloc_inode, pass); - BUG_ON(ret < 0); - if (ret > 0) - skipped++; - - key.objectid = cur_byte; - key.type = 0; - key.offset = 0; } - - btrfs_release_path(root, path); - - if (pass == 0) { - btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1); - invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1); - } - - if (total_found > 0) { - printk(KERN_INFO "btrfs found %llu extents in pass %d\n", - (unsigned long long)total_found, pass); - pass++; - if (total_found == skipped && pass > 2) { - iput(reloc_inode); - reloc_inode = create_reloc_inode(info, block_group); - pass = 0; - } - goto again; - } - - /* delete reloc_inode */ - iput(reloc_inode); - - /* unpin extents in this range */ - trans = btrfs_start_transaction(info->tree_root, 1); - btrfs_commit_transaction(trans, info->tree_root); - - spin_lock(&block_group->lock); - WARN_ON(block_group->pinned > 0); - WARN_ON(block_group->reserved > 0); - WARN_ON(btrfs_block_group_used(&block_group->item) > 0); - spin_unlock(&block_group->lock); - btrfs_put_block_group(block_group); - ret = 0; + mutex_unlock(&root->fs_info->chunk_mutex); out: - btrfs_free_path(path); + btrfs_put_block_group(block_group); return ret; } -#endif static int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key) @@ -7165,8 +7282,18 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) { struct btrfs_block_group_cache *block_group; struct btrfs_space_info *space_info; + struct btrfs_caching_control *caching_ctl; struct rb_node *n; + down_write(&info->extent_commit_sem); + while (!list_empty(&info->caching_block_groups)) { + caching_ctl = list_entry(info->caching_block_groups.next, + struct btrfs_caching_control, list); + list_del(&caching_ctl->list); + put_caching_control(caching_ctl); + } + up_write(&info->extent_commit_sem); + spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { block_group = rb_entry(n, struct btrfs_block_group_cache, @@ -7180,8 +7307,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) up_write(&block_group->space_info->groups_sem); if (block_group->cached == BTRFS_CACHE_STARTED) - wait_event(block_group->caching_q, - block_group_cache_done(block_group)); + wait_block_group_cache_done(block_group); btrfs_remove_free_space_cache(block_group); @@ -7251,7 +7377,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) spin_lock_init(&cache->lock); spin_lock_init(&cache->tree_lock); cache->fs_info = info; - init_waitqueue_head(&cache->caching_q); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); @@ -7273,8 +7398,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) cache->flags = btrfs_block_group_flags(&cache->item); cache->sectorsize = root->sectorsize; - remove_sb_from_cache(root, cache); - /* * check for two cases, either we are full, and therefore * don't need to bother with the caching work since we won't @@ -7283,13 +7406,19 @@ int btrfs_read_block_groups(struct btrfs_root *root) * time, particularly in the full case. */ if (found_key.offset == btrfs_block_group_used(&cache->item)) { + exclude_super_stripes(root, cache); + cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; + free_excluded_extents(root, cache); } else if (btrfs_block_group_used(&cache->item) == 0) { + exclude_super_stripes(root, cache); + cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; add_new_free_space(cache, root->fs_info, found_key.objectid, found_key.objectid + found_key.offset); + free_excluded_extents(root, cache); } ret = update_space_info(info, cache->flags, found_key.offset, @@ -7297,6 +7426,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) &space_info); BUG_ON(ret); cache->space_info = space_info; + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_super += cache->bytes_super; + spin_unlock(&cache->space_info->lock); + down_write(&space_info->groups_sem); list_add_tail(&cache->list, &space_info->block_groups); up_write(&space_info->groups_sem); @@ -7346,7 +7479,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); spin_lock_init(&cache->tree_lock); - init_waitqueue_head(&cache->caching_q); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); @@ -7355,15 +7487,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->flags = type; btrfs_set_block_group_flags(&cache->item, type); + cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; - remove_sb_from_cache(root, cache); + exclude_super_stripes(root, cache); add_new_free_space(cache, root->fs_info, chunk_offset, chunk_offset + size); + free_excluded_extents(root, cache); + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); BUG_ON(ret); + + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_super += cache->bytes_super; + spin_unlock(&cache->space_info->lock); + down_write(&cache->space_info->groups_sem); list_add_tail(&cache->list, &cache->space_info->block_groups); up_write(&cache->space_info->groups_sem); @@ -7429,8 +7569,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, up_write(&block_group->space_info->groups_sem); if (block_group->cached == BTRFS_CACHE_STARTED) - wait_event(block_group->caching_q, - block_group_cache_done(block_group)); + wait_block_group_cache_done(block_group); btrfs_remove_free_space_cache(block_group); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6826018..96577e8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree, return NULL; } +static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, + struct extent_state *other) +{ + if (tree->ops && tree->ops->merge_extent_hook) + tree->ops->merge_extent_hook(tree->mapping->host, new, + other); +} + /* * utility function to look for merge candidates inside a given range. * Any extents with matching state are merged together into a single @@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree, other = rb_entry(other_node, struct extent_state, rb_node); if (other->end == state->start - 1 && other->state == state->state) { + merge_cb(tree, state, other); state->start = other->start; other->tree = NULL; rb_erase(&other->rb_node, &tree->state); @@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree, other = rb_entry(other_node, struct extent_state, rb_node); if (other->start == state->end + 1 && other->state == state->state) { + merge_cb(tree, state, other); other->start = state->start; state->tree = NULL; rb_erase(&state->rb_node, &tree->state); free_extent_state(state); + state = NULL; } } + return 0; } -static void set_state_cb(struct extent_io_tree *tree, +static int set_state_cb(struct extent_io_tree *tree, struct extent_state *state, unsigned long bits) { if (tree->ops && tree->ops->set_bit_hook) { - tree->ops->set_bit_hook(tree->mapping->host, state->start, - state->end, state->state, bits); + return tree->ops->set_bit_hook(tree->mapping->host, + state->start, state->end, + state->state, bits); } + + return 0; } static void clear_state_cb(struct extent_io_tree *tree, struct extent_state *state, unsigned long bits) { - if (tree->ops && tree->ops->clear_bit_hook) { - tree->ops->clear_bit_hook(tree->mapping->host, state->start, - state->end, state->state, bits); - } + if (tree->ops && tree->ops->clear_bit_hook) + tree->ops->clear_bit_hook(tree->mapping->host, state, bits); } /* @@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree, int bits) { struct rb_node *node; + int ret; if (end < start) { printk(KERN_ERR "btrfs end < start %llu %llu\n", @@ -365,12 +379,15 @@ static int insert_state(struct extent_io_tree *tree, (unsigned long long)start); WARN_ON(1); } + state->start = start; + state->end = end; + ret = set_state_cb(tree, state, bits); + if (ret) + return ret; + if (bits & EXTENT_DIRTY) tree->dirty_bytes += end - start + 1; - set_state_cb(tree, state, bits); state->state |= bits; - state->start = start; - state->end = end; node = tree_insert(&tree->state, end, &state->rb_node); if (node) { struct extent_state *found; @@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree, return 0; } +static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, + u64 split) +{ + if (tree->ops && tree->ops->split_extent_hook) + return tree->ops->split_extent_hook(tree->mapping->host, + orig, split); + return 0; +} + /* * split a given extent state struct in two, inserting the preallocated * struct 'prealloc' as the newly created second half. 'split' indicates an @@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct extent_state *prealloc, u64 split) { struct rb_node *node; + + split_cb(tree, orig, split); + prealloc->start = orig->start; prealloc->end = split - 1; prealloc->state = orig->state; @@ -431,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, int bits, int wake, int delete) { - int ret = state->state & bits; + int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; + int ret = state->state & bits_to_clear; if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; @@ -439,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree, tree->dirty_bytes -= range; } clear_state_cb(tree, state, bits); - state->state &= ~bits; + state->state &= ~bits_to_clear; if (wake) wake_up(&state->wq); if (delete || state->state == 0) { @@ -471,10 +501,14 @@ static int clear_state_bit(struct extent_io_tree *tree, * bits were already set, or zero if none of the bits were already set. */ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int wake, int delete, gfp_t mask) + int bits, int wake, int delete, + struct extent_state **cached_state, + gfp_t mask) { struct extent_state *state; + struct extent_state *cached; struct extent_state *prealloc = NULL; + struct rb_node *next_node; struct rb_node *node; u64 last_end; int err; @@ -488,6 +522,17 @@ again: } spin_lock(&tree->lock); + if (cached_state) { + cached = *cached_state; + *cached_state = NULL; + cached_state = NULL; + if (cached && cached->tree && cached->start == start) { + atomic_dec(&cached->refs); + state = cached; + goto hit_next; + } + free_extent_state(cached); + } /* * this search will find the extents that end after * our range starts @@ -496,6 +541,7 @@ again: if (!node) goto out; state = rb_entry(node, struct extent_state, rb_node); +hit_next: if (state->start > end) goto out; WARN_ON(state->end < start); @@ -526,13 +572,11 @@ again: if (err) goto out; if (state->end <= end) { - set |= clear_state_bit(tree, state, bits, - wake, delete); + set |= clear_state_bit(tree, state, bits, wake, + delete); if (last_end == (u64)-1) goto out; start = last_end + 1; - } else { - start = state->start; } goto search_again; } @@ -547,19 +591,30 @@ again: prealloc = alloc_extent_state(GFP_ATOMIC); err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); - if (wake) wake_up(&state->wq); - set |= clear_state_bit(tree, prealloc, bits, - wake, delete); + + set |= clear_state_bit(tree, prealloc, bits, wake, delete); + prealloc = NULL; goto out; } + if (state->end < end && prealloc && !need_resched()) + next_node = rb_next(&state->rb_node); + else + next_node = NULL; + set |= clear_state_bit(tree, state, bits, wake, delete); if (last_end == (u64)-1) goto out; start = last_end + 1; + if (start <= end && next_node) { + state = rb_entry(next_node, struct extent_state, + rb_node); + if (state->start == start) + goto hit_next; + } goto search_again; out: @@ -641,40 +696,59 @@ out: return 0; } -static void set_state_bits(struct extent_io_tree *tree, +static int set_state_bits(struct extent_io_tree *tree, struct extent_state *state, int bits) { + int ret; + + ret = set_state_cb(tree, state, bits); + if (ret) + return ret; + if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } - set_state_cb(tree, state, bits); state->state |= bits; + + return 0; +} + +static void cache_state(struct extent_state *state, + struct extent_state **cached_ptr) +{ + if (cached_ptr && !(*cached_ptr)) { + if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { + *cached_ptr = state; + atomic_inc(&state->refs); + } + } } /* - * set some bits on a range in the tree. This may require allocations - * or sleeping, so the gfp mask is used to indicate what is allowed. + * set some bits on a range in the tree. This may require allocations or + * sleeping, so the gfp mask is used to indicate what is allowed. * - * If 'exclusive' == 1, this will fail with -EEXIST if some part of the - * range already has the desired bits set. The start of the existing - * range is returned in failed_start in this case. + * If any of the exclusive bits are set, this will fail with -EEXIST if some + * part of the range already has the desired bits set. The start of the + * existing range is returned in failed_start in this case. * - * [start, end] is inclusive - * This takes the tree lock. + * [start, end] is inclusive This takes the tree lock. */ + static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int exclusive, u64 *failed_start, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; int err = 0; - int set; u64 last_start; u64 last_end; + again: if (!prealloc && (mask & __GFP_WAIT)) { prealloc = alloc_extent_state(mask); @@ -683,6 +757,13 @@ again: } spin_lock(&tree->lock); + if (cached_state && *cached_state) { + state = *cached_state; + if (state->start == start && state->tree) { + node = &state->rb_node; + goto hit_next; + } + } /* * this search will find all the extents that end after * our range starts. @@ -694,8 +775,8 @@ again: BUG_ON(err == -EEXIST); goto out; } - state = rb_entry(node, struct extent_state, rb_node); +hit_next: last_start = state->start; last_end = state->end; @@ -706,17 +787,32 @@ again: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - set = state->state & bits; - if (set && exclusive) { + struct rb_node *next_node; + if (state->state & exclusive_bits) { *failed_start = state->start; err = -EEXIST; goto out; } - set_state_bits(tree, state, bits); + + err = set_state_bits(tree, state, bits); + if (err) + goto out; + + cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) goto out; + start = last_end + 1; + if (start < end && prealloc && !need_resched()) { + next_node = rb_next(node); + if (next_node) { + state = rb_entry(next_node, struct extent_state, + rb_node); + if (state->start == start) + goto hit_next; + } + } goto search_again; } @@ -737,8 +833,7 @@ again: * desired bit on it. */ if (state->start < start) { - set = state->state & bits; - if (exclusive && set) { + if (state->state & exclusive_bits) { *failed_start = start; err = -EEXIST; goto out; @@ -749,13 +844,14 @@ again: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, bits); + err = set_state_bits(tree, state, bits); + if (err) + goto out; + cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) goto out; start = last_end + 1; - } else { - start = state->start; } goto search_again; } @@ -774,10 +870,13 @@ again: this_end = last_start - 1; err = insert_state(tree, prealloc, start, this_end, bits); - prealloc = NULL; BUG_ON(err == -EEXIST); - if (err) + if (err) { + prealloc = NULL; goto out; + } + cache_state(prealloc, cached_state); + prealloc = NULL; start = this_end + 1; goto search_again; } @@ -788,8 +887,7 @@ again: * on the first half */ if (state->start <= end && state->end > end) { - set = state->state & bits; - if (exclusive && set) { + if (state->state & exclusive_bits) { *failed_start = start; err = -EEXIST; goto out; @@ -797,7 +895,12 @@ again: err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); - set_state_bits(tree, prealloc, bits); + err = set_state_bits(tree, prealloc, bits); + if (err) { + prealloc = NULL; + goto out; + } + cache_state(prealloc, cached_state); merge_state(tree, prealloc); prealloc = NULL; goto out; @@ -826,86 +929,65 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, - mask); -} - -int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask); + NULL, mask); } int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask) { return set_extent_bit(tree, start, end, bits, 0, NULL, - mask); + NULL, mask); } int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask) { - return clear_extent_bit(tree, start, end, bits, 0, 0, mask); + return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); } int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_DIRTY, - 0, NULL, mask); + EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, + 0, NULL, NULL, mask); } int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); -} - -int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask); + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, + NULL, mask); } int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, - mask); + NULL, mask); } static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); + return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, + NULL, mask); } int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, - mask); + NULL, mask); } static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); -} - -static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, - 0, NULL, mask); -} - -static int clear_extent_writeback(struct extent_io_tree *tree, u64 start, - u64 end, gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask); + return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, + NULL, mask); } int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) @@ -917,13 +999,15 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) * either insert or lock state struct between start and end use mask to tell * us if waiting is desired. */ -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, struct extent_state **cached_state, gfp_t mask) { int err; u64 failed_start; while (1) { - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, - &failed_start, mask); + err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, + EXTENT_LOCKED, &failed_start, + cached_state, mask); if (err == -EEXIST && (mask & __GFP_WAIT)) { wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); start = failed_start; @@ -935,27 +1019,40 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) return err; } +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +{ + return lock_extent_bits(tree, start, end, 0, NULL, mask); +} + int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { int err; u64 failed_start; - err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, - &failed_start, mask); + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, + &failed_start, NULL, mask); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, 1, 0, mask); + EXTENT_LOCKED, 1, 0, NULL, mask); return 0; } return 1; } +int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + mask); +} + int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) { - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, + mask); } /* @@ -974,7 +1071,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end) page_cache_release(page); index++; } - set_extent_dirty(tree, start, end, GFP_NOFS); return 0; } @@ -994,7 +1090,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) page_cache_release(page); index++; } - set_extent_writeback(tree, start, end, GFP_NOFS); return 0; } @@ -1232,6 +1327,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode, u64 delalloc_start; u64 delalloc_end; u64 found; + struct extent_state *cached_state = NULL; int ret; int loops = 0; @@ -1269,6 +1365,7 @@ again: /* some of the pages are gone, lets avoid looping by * shortening the size of the delalloc range we're searching */ + free_extent_state(cached_state); if (!loops) { unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); max_bytes = PAGE_CACHE_SIZE - offset; @@ -1282,18 +1379,21 @@ again: BUG_ON(ret); /* step three, lock the state bits for the whole range */ - lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); + lock_extent_bits(tree, delalloc_start, delalloc_end, + 0, &cached_state, GFP_NOFS); /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, - EXTENT_DELALLOC, 1); + EXTENT_DELALLOC, 1, cached_state); if (!ret) { - unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); + unlock_extent_cached(tree, delalloc_start, delalloc_end, + &cached_state, GFP_NOFS); __unlock_for_delalloc(inode, locked_page, delalloc_start, delalloc_end); cond_resched(); goto again; } + free_extent_state(cached_state); *start = delalloc_start; *end = delalloc_end; out_failed: @@ -1303,11 +1403,7 @@ out_failed: int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, - int unlock_pages, - int clear_unlock, - int clear_delalloc, int clear_dirty, - int set_writeback, - int end_writeback) + unsigned long op) { int ret; struct page *pages[16]; @@ -1317,16 +1413,21 @@ int extent_clear_unlock_delalloc(struct inode *inode, int i; int clear_bits = 0; - if (clear_unlock) + if (op & EXTENT_CLEAR_UNLOCK) clear_bits |= EXTENT_LOCKED; - if (clear_dirty) + if (op & EXTENT_CLEAR_DIRTY) clear_bits |= EXTENT_DIRTY; - if (clear_delalloc) + if (op & EXTENT_CLEAR_DELALLOC) clear_bits |= EXTENT_DELALLOC; - clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); - if (!(unlock_pages || clear_dirty || set_writeback || end_writeback)) + if (op & EXTENT_CLEAR_ACCOUNTING) + clear_bits |= EXTENT_DO_ACCOUNTING; + + clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); + if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | + EXTENT_SET_PRIVATE2))) return 0; while (nr_pages > 0) { @@ -1334,17 +1435,21 @@ int extent_clear_unlock_delalloc(struct inode *inode, min_t(unsigned long, nr_pages, ARRAY_SIZE(pages)), pages); for (i = 0; i < ret; i++) { + + if (op & EXTENT_SET_PRIVATE2) + SetPagePrivate2(pages[i]); + if (pages[i] == locked_page) { page_cache_release(pages[i]); continue; } - if (clear_dirty) + if (op & EXTENT_CLEAR_DIRTY) clear_page_dirty_for_io(pages[i]); - if (set_writeback) + if (op & EXTENT_SET_WRITEBACK) set_page_writeback(pages[i]); - if (end_writeback) + if (op & EXTENT_END_WRITEBACK) end_page_writeback(pages[i]); - if (unlock_pages) + if (op & EXTENT_CLEAR_UNLOCK_PAGE) unlock_page(pages[i]); page_cache_release(pages[i]); } @@ -1476,14 +1581,17 @@ out: * range is found set. */ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int filled) + int bits, int filled, struct extent_state *cached) { struct extent_state *state = NULL; struct rb_node *node; int bitset = 0; spin_lock(&tree->lock); - node = tree_search(tree, start); + if (cached && cached->tree && cached->start == start) + node = &cached->rb_node; + else + node = tree_search(tree, start); while (node && start <= end) { state = rb_entry(node, struct extent_state, rb_node); @@ -1503,6 +1611,10 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, bitset = 0; break; } + + if (state->end == (u64)-1) + break; + start = state->end + 1; if (start > end) break; @@ -1526,7 +1638,7 @@ static int check_page_uptodate(struct extent_io_tree *tree, { u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; - if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) + if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) SetPageUptodate(page); return 0; } @@ -1540,7 +1652,7 @@ static int check_page_locked(struct extent_io_tree *tree, { u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 end = start + PAGE_CACHE_SIZE - 1; - if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) + if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) unlock_page(page); return 0; } @@ -1552,10 +1664,7 @@ static int check_page_locked(struct extent_io_tree *tree, static int check_page_writeback(struct extent_io_tree *tree, struct page *page) { - u64 start = (u64)page->index << PAGE_CACHE_SHIFT; - u64 end = start + PAGE_CACHE_SIZE - 1; - if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) - end_page_writeback(page); + end_page_writeback(page); return 0; } @@ -1613,13 +1722,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err) } if (!uptodate) { - clear_extent_uptodate(tree, start, end, GFP_ATOMIC); + clear_extent_uptodate(tree, start, end, GFP_NOFS); ClearPageUptodate(page); SetPageError(page); } - clear_extent_writeback(tree, start, end, GFP_ATOMIC); - if (whole_page) end_page_writeback(page); else @@ -1983,7 +2090,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, continue; } /* the get_extent function already copied into the page */ - if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { + if (test_range_bit(tree, cur, cur_end, + EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); cur = cur + iosize; @@ -2078,6 +2186,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, u64 iosize; u64 unlock_start; sector_t sector; + struct extent_state *cached_state = NULL; struct extent_map *em; struct block_device *bdev; int ret; @@ -2124,6 +2233,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, delalloc_end = 0; page_started = 0; if (!epd->extent_locked) { + u64 delalloc_to_write = 0; /* * make sure the wbc mapping index is at least updated * to this page. @@ -2143,8 +2253,24 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, tree->ops->fill_delalloc(inode, page, delalloc_start, delalloc_end, &page_started, &nr_written); + /* + * delalloc_end is already one less than the total + * length, so we don't subtract one from + * PAGE_CACHE_SIZE + */ + delalloc_to_write += (delalloc_end - delalloc_start + + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; delalloc_start = delalloc_end + 1; } + if (wbc->nr_to_write < delalloc_to_write) { + int thresh = 8192; + + if (delalloc_to_write < thresh * 2) + thresh = delalloc_to_write; + wbc->nr_to_write = min_t(u64, delalloc_to_write, + thresh); + } /* did the fill delalloc function already unlock and start * the IO? @@ -2160,15 +2286,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, goto done_unlocked; } } - lock_extent(tree, start, page_end, GFP_NOFS); - - unlock_start = start; - if (tree->ops && tree->ops->writepage_start_hook) { ret = tree->ops->writepage_start_hook(page, start, page_end); if (ret == -EAGAIN) { - unlock_extent(tree, start, page_end, GFP_NOFS); redirty_page_for_writepage(wbc, page); update_nr_written(page, wbc, nr_written); unlock_page(page); @@ -2184,12 +2305,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, update_nr_written(page, wbc, nr_written + 1); end = page_end; - if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) - printk(KERN_ERR "btrfs delalloc bits after lock_extent\n"); - if (last_byte <= start) { - clear_extent_dirty(tree, start, page_end, GFP_NOFS); - unlock_extent(tree, start, page_end, GFP_NOFS); if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, start, page_end, NULL, 1); @@ -2197,13 +2313,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, goto done; } - set_extent_uptodate(tree, start, page_end, GFP_NOFS); blocksize = inode->i_sb->s_blocksize; while (cur <= end) { if (cur >= last_byte) { - clear_extent_dirty(tree, cur, page_end, GFP_NOFS); - unlock_extent(tree, unlock_start, page_end, GFP_NOFS); if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, page_end, NULL, 1); @@ -2235,12 +2348,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, */ if (compressed || block_start == EXTENT_MAP_HOLE || block_start == EXTENT_MAP_INLINE) { - clear_extent_dirty(tree, cur, - cur + iosize - 1, GFP_NOFS); - - unlock_extent(tree, unlock_start, cur + iosize - 1, - GFP_NOFS); - /* * end_io notification does not happen here for * compressed extents @@ -2265,13 +2372,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } /* leave this out until we have a page_mkwrite call */ if (0 && !test_range_bit(tree, cur, cur + iosize - 1, - EXTENT_DIRTY, 0)) { + EXTENT_DIRTY, 0, NULL)) { cur = cur + iosize; pg_offset += iosize; continue; } - clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); if (tree->ops && tree->ops->writepage_io_hook) { ret = tree->ops->writepage_io_hook(page, cur, cur + iosize - 1); @@ -2309,12 +2415,12 @@ done: set_page_writeback(page); end_page_writeback(page); } - if (unlock_start <= page_end) - unlock_extent(tree, unlock_start, page_end, GFP_NOFS); unlock_page(page); done_unlocked: + /* drop our reference on any cached states */ + free_extent_state(cached_state); return 0; } @@ -2339,9 +2445,9 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, writepage_t writepage, void *data, void (*flush_fn)(void *)) { - struct backing_dev_info *bdi = mapping->backing_dev_info; int ret = 0; int done = 0; + int nr_to_write_done = 0; struct pagevec pvec; int nr_pages; pgoff_t index; @@ -2361,7 +2467,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, scanned = 1; } retry: - while (!done && (index <= end) && + while (!done && !nr_to_write_done && (index <= end) && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { @@ -2412,12 +2518,15 @@ retry: unlock_page(page); ret = 0; } - if (ret || wbc->nr_to_write <= 0) - done = 1; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; + if (ret) done = 1; - } + + /* + * the filesystem may choose to bump up nr_to_write. + * We have to make sure to honor the new nr_to_write + * at any time + */ + nr_to_write_done = wbc->nr_to_write <= 0; } pagevec_release(&pvec); cond_resched(); @@ -2604,10 +2713,11 @@ int extent_invalidatepage(struct extent_io_tree *tree, return 0; lock_extent(tree, start, end, GFP_NOFS); - wait_on_extent_writeback(tree, start, end); + wait_on_page_writeback(page); clear_extent_bit(tree, start, end, - EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, - 1, 1, GFP_NOFS); + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, + 1, 1, NULL, GFP_NOFS); return 0; } @@ -2687,7 +2797,7 @@ int extent_prepare_write(struct extent_io_tree *tree, !isnew && !PageUptodate(page) && (block_off_end > to || block_off_start < from) && !test_range_bit(tree, block_start, cur_end, - EXTENT_UPTODATE, 1)) { + EXTENT_UPTODATE, 1, NULL)) { u64 sector; u64 extent_offset = block_start - em->start; size_t iosize; @@ -2701,7 +2811,7 @@ int extent_prepare_write(struct extent_io_tree *tree, */ set_extent_bit(tree, block_start, block_start + iosize - 1, - EXTENT_LOCKED, 0, NULL, GFP_NOFS); + EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS); ret = submit_extent_page(READ, tree, page, sector, iosize, page_offset, em->bdev, NULL, 1, @@ -2742,13 +2852,18 @@ int try_release_extent_state(struct extent_map_tree *map, int ret = 1; if (test_range_bit(tree, start, end, - EXTENT_IOBITS | EXTENT_ORDERED, 0)) + EXTENT_IOBITS, 0, NULL)) ret = 0; else { if ((mask & GFP_NOFS) == GFP_NOFS) mask = GFP_NOFS; - clear_extent_bit(tree, start, end, EXTENT_UPTODATE, - 1, 1, mask); + /* + * at this point we can safely clear everything except the + * locked bit and the nodatasum bit + */ + clear_extent_bit(tree, start, end, + ~(EXTENT_LOCKED | EXTENT_NODATASUM), + 0, 0, NULL, mask); } return ret; } @@ -2771,29 +2886,28 @@ int try_release_extent_mapping(struct extent_map_tree *map, u64 len; while (start <= end) { len = end - start + 1; - spin_lock(&map->lock); + write_lock(&map->lock); em = lookup_extent_mapping(map, start, len); if (!em || IS_ERR(em)) { - spin_unlock(&map->lock); + write_unlock(&map->lock); break; } if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || em->start != start) { - spin_unlock(&map->lock); + write_unlock(&map->lock); free_extent_map(em); break; } if (!test_range_bit(tree, em->start, extent_map_end(em) - 1, - EXTENT_LOCKED | EXTENT_WRITEBACK | - EXTENT_ORDERED, - 0)) { + EXTENT_LOCKED | EXTENT_WRITEBACK, + 0, NULL)) { remove_extent_mapping(map, em); /* once for the rb tree */ free_extent_map(em); } start = extent_map_end(em); - spin_unlock(&map->lock); + write_unlock(&map->lock); /* once for us */ free_extent_map(em); @@ -3203,7 +3317,7 @@ int extent_range_uptodate(struct extent_io_tree *tree, int uptodate; unsigned long index; - ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1); + ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); if (ret) return 1; while (start <= end) { @@ -3233,7 +3347,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree, return 1; ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1); + EXTENT_UPTODATE, 1, NULL); if (ret) return ret; @@ -3269,7 +3383,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, return 0; if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, - EXTENT_UPTODATE, 1)) { + EXTENT_UPTODATE, 1, NULL)) { return 0; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5bc20ab..36de250 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -13,10 +13,9 @@ #define EXTENT_DEFRAG (1 << 6) #define EXTENT_DEFRAG_DONE (1 << 7) #define EXTENT_BUFFER_FILLED (1 << 8) -#define EXTENT_ORDERED (1 << 9) -#define EXTENT_ORDERED_METADATA (1 << 10) -#define EXTENT_BOUNDARY (1 << 11) -#define EXTENT_NODATASUM (1 << 12) +#define EXTENT_BOUNDARY (1 << 9) +#define EXTENT_NODATASUM (1 << 10) +#define EXTENT_DO_ACCOUNTING (1 << 11) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) /* flags for bio submission */ @@ -27,6 +26,16 @@ #define EXTENT_BUFFER_BLOCKING 1 #define EXTENT_BUFFER_DIRTY 2 +/* these are flags for extent_clear_unlock_delalloc */ +#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 +#define EXTENT_CLEAR_UNLOCK 0x2 +#define EXTENT_CLEAR_DELALLOC 0x4 +#define EXTENT_CLEAR_DIRTY 0x8 +#define EXTENT_SET_WRITEBACK 0x10 +#define EXTENT_END_WRITEBACK 0x20 +#define EXTENT_SET_PRIVATE2 0x40 +#define EXTENT_CLEAR_ACCOUNTING 0x80 + /* * page->private values. Every page that is controlled by the extent * map has page->private set to one. @@ -62,8 +71,13 @@ struct extent_io_ops { struct extent_state *state, int uptodate); int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, unsigned long old, unsigned long bits); - int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, - unsigned long old, unsigned long bits); + int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, + unsigned long bits); + int (*merge_extent_hook)(struct inode *inode, + struct extent_state *new, + struct extent_state *other); + int (*split_extent_hook)(struct inode *inode, + struct extent_state *orig, u64 split); int (*write_cache_pages_lock_hook)(struct page *page); }; @@ -81,10 +95,14 @@ struct extent_state { u64 start; u64 end; /* inclusive */ struct rb_node rb_node; + + /* ADD NEW ELEMENTS AFTER THIS */ struct extent_io_tree *tree; wait_queue_head_t wq; atomic_t refs; unsigned long state; + u64 split_start; + u64 split_end; /* for use by the FS */ u64 private; @@ -142,6 +160,8 @@ int try_release_extent_state(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, struct extent_state **cached, gfp_t mask); int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); @@ -155,11 +175,12 @@ u64 count_range_bits(struct extent_io_tree *tree, u64 max_bytes, unsigned long bits); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int filled); + int bits, int filled, struct extent_state *cached_state); int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int wake, int delete, gfp_t mask); + int bits, int wake, int delete, struct extent_state **cached, + gfp_t mask); int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, @@ -278,9 +299,5 @@ int extent_range_uptodate(struct extent_io_tree *tree, int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, - int unlock_page, - int clear_unlock, - int clear_delalloc, int clear_dirty, - int set_writeback, - int end_writeback); + unsigned long op); #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 30c9365..2c726b7 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -36,7 +36,7 @@ void extent_map_exit(void) void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) { tree->map.rb_node = NULL; - spin_lock_init(&tree->lock); + rwlock_init(&tree->lock); } /** @@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) return 0; } +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) +{ + int ret = 0; + struct extent_map *merge = NULL; + struct rb_node *rb; + struct extent_map *em; + + write_lock(&tree->lock); + em = lookup_extent_mapping(tree, start, len); + + WARN_ON(em->start != start || !em); + + if (!em) + goto out; + + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + + if (em->start != 0) { + rb = rb_prev(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(merge, em)) { + em->start = merge->start; + em->len += merge->len; + em->block_len += merge->block_len; + em->block_start = merge->block_start; + merge->in_tree = 0; + rb_erase(&merge->rb_node, &tree->map); + free_extent_map(merge); + } + } + + rb = rb_next(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(em, merge)) { + em->len += merge->len; + em->block_len += merge->len; + rb_erase(&merge->rb_node, &tree->map); + merge->in_tree = 0; + free_extent_map(merge); + } + + free_extent_map(em); +out: + write_unlock(&tree->lock); + return ret; + +} + /** * add_extent_mapping - add new extent map to the extent tree * @tree: tree to insert new map in @@ -222,7 +272,6 @@ int add_extent_mapping(struct extent_map_tree *tree, ret = -EEXIST; goto out; } - assert_spin_locked(&tree->lock); rb = tree_insert(&tree->map, em->start, &em->rb_node); if (rb) { ret = -EEXIST; @@ -285,7 +334,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, struct rb_node *next = NULL; u64 end = range_end(start, len); - assert_spin_locked(&tree->lock); rb_node = __tree_search(&tree->map, start, &prev, &next); if (!rb_node && prev) { em = rb_entry(prev, struct extent_map, rb_node); @@ -319,6 +367,54 @@ out: } /** + * search_extent_mapping - find a nearby extent map + * @tree: tree to lookup in + * @start: byte offset to start the search + * @len: length of the lookup range + * + * Find and return the first extent_map struct in @tree that intersects the + * [start, len] range. + * + * If one can't be found, any nearby extent may be returned + */ +struct extent_map *search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) +{ + struct extent_map *em; + struct rb_node *rb_node; + struct rb_node *prev = NULL; + struct rb_node *next = NULL; + + rb_node = __tree_search(&tree->map, start, &prev, &next); + if (!rb_node && prev) { + em = rb_entry(prev, struct extent_map, rb_node); + goto found; + } + if (!rb_node && next) { + em = rb_entry(next, struct extent_map, rb_node); + goto found; + } + if (!rb_node) { + em = NULL; + goto out; + } + if (IS_ERR(rb_node)) { + em = ERR_PTR(PTR_ERR(rb_node)); + goto out; + } + em = rb_entry(rb_node, struct extent_map, rb_node); + goto found; + + em = NULL; + goto out; + +found: + atomic_inc(&em->refs); +out: + return em; +} + +/** * remove_extent_mapping - removes an extent_map from the extent tree * @tree: extent tree to remove from * @em: extent map beeing removed @@ -331,7 +427,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) int ret = 0; WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); - assert_spin_locked(&tree->lock); rb_erase(&em->rb_node, &tree->map); em->in_tree = 0; return ret; diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index fb6eeef..ab6d74b 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -31,7 +31,7 @@ struct extent_map { struct extent_map_tree { struct rb_root map; - spinlock_t lock; + rwlock_t lock; }; static inline u64 extent_map_end(struct extent_map *em) @@ -59,4 +59,7 @@ struct extent_map *alloc_extent_map(gfp_t mask); void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void extent_map_exit(void); +int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len); +struct extent_map *search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); #endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4b83397..06550af 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -112,8 +112,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, int err = 0; int i; struct inode *inode = fdentry(file)->d_inode; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - u64 hint_byte; u64 num_bytes; u64 start_pos; u64 end_of_last_block; @@ -125,23 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, root->sectorsize - 1) & ~((u64)root->sectorsize - 1); end_of_last_block = start_pos + num_bytes - 1; + err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); + if (err) + return err; - lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); - trans = btrfs_join_transaction(root, 1); - if (!trans) { - err = -ENOMEM; - goto out_unlock; - } - btrfs_set_trans_block_group(trans, inode); - hint_byte = 0; - - set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS); - - /* check for reserved extents on each page, we don't want - * to reset the delalloc bit on things that already have - * extents reserved. - */ - btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); for (i = 0; i < num_pages; i++) { struct page *p = pages[i]; SetPageUptodate(p); @@ -155,9 +140,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, * at this time. */ } - err = btrfs_end_transaction(trans, root); -out_unlock: - unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); return err; } @@ -189,18 +171,18 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, if (!split2) split2 = alloc_extent_map(GFP_NOFS); - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (!em) { - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); break; } flags = em->flags; if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { - spin_unlock(&em_tree->lock); if (em->start <= start && (!testend || em->start + em->len >= start + len)) { free_extent_map(em); + write_unlock(&em_tree->lock); break; } if (start < em->start) { @@ -210,6 +192,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, start = em->start + em->len; } free_extent_map(em); + write_unlock(&em_tree->lock); continue; } compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); @@ -260,7 +243,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, free_extent_map(split); split = NULL; } - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); /* once for us */ free_extent_map(em); @@ -289,7 +272,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, u64 locked_end, - u64 inline_limit, u64 *hint_byte) + u64 inline_limit, u64 *hint_byte, int drop_cache) { u64 extent_end = 0; u64 search_start = start; @@ -314,7 +297,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, int ret; inline_limit = 0; - btrfs_drop_extent_cache(inode, start, end - 1, 0); + if (drop_cache) + btrfs_drop_extent_cache(inode, start, end - 1, 0); path = btrfs_alloc_path(); if (!path) @@ -894,7 +878,8 @@ again: btrfs_put_ordered_extent(ordered); clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, - last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC, + last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, GFP_NOFS); unlock_extent(&BTRFS_I(inode)->io_tree, start_pos, last_pos - 1, GFP_NOFS); @@ -936,21 +921,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, start_pos = pos; vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + /* do the reserve before the mutex lock in case we have to do some + * flushing. We wouldn't deadlock, but this is more polite. + */ + err = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + if (err) + goto out_nolock; + + mutex_lock(&inode->i_mutex); + current->backing_dev_info = inode->i_mapping->backing_dev_info; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) - goto out_nolock; + goto out; + if (count == 0) - goto out_nolock; + goto out; err = file_remove_suid(file); if (err) - goto out_nolock; + goto out; + file_update_time(file); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); - mutex_lock(&inode->i_mutex); + /* generic_write_checks can change our pos */ + start_pos = pos; + BTRFS_I(inode)->sequence++; first_index = pos >> PAGE_CACHE_SHIFT; last_index = (pos + count) >> PAGE_CACHE_SHIFT; @@ -1024,9 +1023,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, } if (will_write) { - btrfs_fdatawrite_range(inode->i_mapping, pos, - pos + write_bytes - 1, - WB_SYNC_ALL); + filemap_fdatawrite_range(inode->i_mapping, pos, + pos + write_bytes - 1); } else { balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages); @@ -1047,6 +1045,7 @@ out: mutex_unlock(&inode->i_mutex); if (ret) err = ret; + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); out_nolock: kfree(pages); @@ -1087,8 +1086,10 @@ out_nolock: btrfs_end_transaction(trans, root); else btrfs_commit_transaction(trans, root); - } else { + } else if (ret != BTRFS_NO_LOG_SYNC) { btrfs_commit_transaction(trans, root); + } else { + btrfs_end_transaction(trans, root); } } if (file->f_flags & O_DIRECT) { @@ -1138,6 +1139,13 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) int ret = 0; struct btrfs_trans_handle *trans; + + /* we wait first, since the writeback may change the inode */ + root->log_batch++; + /* the VFS called filemap_fdatawrite for us */ + btrfs_wait_ordered_range(inode, 0, (u64)-1); + root->log_batch++; + /* * check the transaction that last modified this inode * and see if its already been committed @@ -1145,6 +1153,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) if (!BTRFS_I(inode)->last_trans) goto out; + /* + * if the last transaction that changed this file was before + * the current transaction, we can bail out now without any + * syncing + */ mutex_lock(&root->fs_info->trans_mutex); if (BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed) { @@ -1154,13 +1167,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) } mutex_unlock(&root->fs_info->trans_mutex); - root->log_batch++; - filemap_fdatawrite(inode->i_mapping); - btrfs_wait_ordered_range(inode, 0, (u64)-1); - root->log_batch++; - - if (datasync && !(inode->i_state & I_DIRTY_PAGES)) - goto out; /* * ok we haven't committed the transaction yet, lets do a commit */ @@ -1189,21 +1195,25 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) */ mutex_unlock(&dentry->d_inode->i_mutex); - if (ret > 0) { - ret = btrfs_commit_transaction(trans, root); - } else { - ret = btrfs_sync_log(trans, root); - if (ret == 0) - ret = btrfs_end_transaction(trans, root); - else + if (ret != BTRFS_NO_LOG_SYNC) { + if (ret > 0) { ret = btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_sync_log(trans, root); + if (ret == 0) + ret = btrfs_end_transaction(trans, root); + else + ret = btrfs_commit_transaction(trans, root); + } + } else { + ret = btrfs_end_transaction(trans, root); } mutex_lock(&dentry->d_inode->i_mutex); out: return ret > 0 ? EIO : ret; } -static struct vm_operations_struct btrfs_file_vm_ops = { +static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = btrfs_page_mkwrite, }; @@ -1215,7 +1225,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) return 0; } -struct file_operations btrfs_file_operations = { +const struct file_operations btrfs_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, .aio_read = generic_file_aio_read, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 5edcee3..5c2caad 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -259,7 +259,9 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) { - u64 max_bytes, possible_bytes; + u64 max_bytes; + u64 bitmap_bytes; + u64 extent_bytes; /* * The goal is to keep the total amount of memory used per 1gb of space @@ -269,22 +271,27 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) max_bytes = MAX_CACHE_BYTES_PER_GIG * (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); - possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) + - (sizeof(struct btrfs_free_space) * - block_group->extents_thresh); + /* + * we want to account for 1 more bitmap than what we have so we can make + * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as + * we add more bitmaps. + */ + bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE; - if (possible_bytes > max_bytes) { - int extent_bytes = max_bytes - - (block_group->total_bitmaps * PAGE_CACHE_SIZE); + if (bitmap_bytes >= max_bytes) { + block_group->extents_thresh = 0; + return; + } - if (extent_bytes <= 0) { - block_group->extents_thresh = 0; - return; - } + /* + * we want the extent entry threshold to always be at most 1/2 the maxw + * bytes we can have, or whatever is less than that. + */ + extent_bytes = max_bytes - bitmap_bytes; + extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); - block_group->extents_thresh = extent_bytes / - (sizeof(struct btrfs_free_space)); - } + block_group->extents_thresh = + div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); } static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group, @@ -403,6 +410,7 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group, BUG_ON(block_group->total_bitmaps >= max_bitmaps); info->offset = offset_to_bitmap(block_group, offset); + info->bytes = 0; link_free_space(block_group, info); block_group->total_bitmaps++; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 6b627c6..72ce3c1 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -149,6 +149,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ptr = (unsigned long)(ref + 1); ret = 0; } else if (ret < 0) { + if (ret == -EOVERFLOW) + ret = -EMLINK; goto out; } else { ref = btrfs_item_ptr(path->nodes[0], path->slots[0], @@ -177,8 +179,6 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(struct btrfs_inode_item)); - if (ret == 0 && objectid > root->highest_inode) - root->highest_inode = objectid; return ret; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 9abbced..c56eb59 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -43,9 +43,10 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) slot = path->slots[0] - 1; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &found_key, slot); - *objectid = found_key.objectid; + *objectid = max_t(u64, found_key.objectid, + BTRFS_FIRST_FREE_OBJECTID - 1); } else { - *objectid = BTRFS_FIRST_FREE_OBJECTID; + *objectid = BTRFS_FIRST_FREE_OBJECTID - 1; } ret = 0; error: @@ -53,91 +54,27 @@ error: return ret; } -/* - * walks the btree of allocated inodes and find a hole. - */ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 dirid, u64 *objectid) { - struct btrfs_path *path; - struct btrfs_key key; int ret; - int slot = 0; - u64 last_ino = 0; - int start_found; - struct extent_buffer *l; - struct btrfs_key search_key; - u64 search_start = dirid; - mutex_lock(&root->objectid_mutex); - if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID && - root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) { - *objectid = ++root->last_inode_alloc; - mutex_unlock(&root->objectid_mutex); - return 0; - } - path = btrfs_alloc_path(); - BUG_ON(!path); - search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID); - search_key.objectid = search_start; - search_key.type = 0; - search_key.offset = 0; - - start_found = 0; - ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); - if (ret < 0) - goto error; - while (1) { - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto error; - if (!start_found) { - *objectid = search_start; - start_found = 1; - goto found; - } - *objectid = last_ino > search_start ? - last_ino : search_start; - goto found; - } - btrfs_item_key_to_cpu(l, &key, slot); - if (key.objectid >= search_start) { - if (start_found) { - if (last_ino < search_start) - last_ino = search_start; - if (key.objectid > last_ino) { - *objectid = last_ino; - goto found; - } - } else if (key.objectid > search_start) { - *objectid = search_start; - goto found; - } - } - if (key.objectid >= BTRFS_LAST_FREE_OBJECTID) - break; + if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { + ret = btrfs_find_highest_inode(root, &root->highest_objectid); + if (ret) + goto out; + } - start_found = 1; - last_ino = key.objectid + 1; - path->slots[0]++; + if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { + ret = -ENOSPC; + goto out; } - BUG_ON(1); -found: - btrfs_release_path(root, path); - btrfs_free_path(path); - BUG_ON(*objectid < search_start); - mutex_unlock(&root->objectid_mutex); - return 0; -error: - btrfs_release_path(root, path); - btrfs_free_path(path); + + *objectid = ++root->highest_objectid; + ret = 0; +out: mutex_unlock(&root->objectid_mutex); return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9096fd0..dae12dc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -62,7 +62,7 @@ static const struct inode_operations btrfs_special_inode_operations; static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; static const struct address_space_operations btrfs_symlink_aops; -static struct file_operations btrfs_dir_file_operations; +static const struct file_operations btrfs_dir_file_operations; static struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; @@ -231,7 +231,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, } ret = btrfs_drop_extents(trans, root, inode, start, - aligned_end, aligned_end, start, &hint_byte); + aligned_end, aligned_end, start, + &hint_byte, 1); BUG_ON(ret); if (isize > actual_end) @@ -240,7 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, inline_len, compressed_size, compressed_pages); BUG_ON(ret); - btrfs_drop_extent_cache(inode, start, aligned_end, 0); + btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); return 0; } @@ -423,9 +424,12 @@ again: * and free up our temp pages. */ extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, NULL, 1, 0, - 0, 1, 1, 1); + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_ACCOUNTING | + EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); ret = 0; goto free_pages_out; } @@ -611,9 +615,9 @@ static noinline int submit_compressed_extents(struct inode *inode, set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); while (1) { - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); break; @@ -636,11 +640,14 @@ static noinline int submit_compressed_extents(struct inode *inode, * clear dirty, set writeback and unlock the pages. */ extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - NULL, 1, 1, 0, 1, 1, 0); + &BTRFS_I(inode)->io_tree, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + NULL, EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK); ret = btrfs_submit_compressed_write(inode, async_extent->start, @@ -711,9 +718,15 @@ static noinline int cow_file_range(struct inode *inode, start, end, 0, NULL); if (ret == 0) { extent_clear_unlock_delalloc(inode, - &BTRFS_I(inode)->io_tree, - start, end, NULL, 1, 1, - 1, 1, 1, 1); + &BTRFS_I(inode)->io_tree, + start, end, NULL, + EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | + EXTENT_CLEAR_DELALLOC | + EXTENT_CLEAR_ACCOUNTING | + EXTENT_CLEAR_DIRTY | + EXTENT_SET_WRITEBACK | + EXTENT_END_WRITEBACK); *nr_written = *nr_written + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; *page_started = 1; @@ -725,9 +738,20 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(disk_num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy)); + + read_lock(&BTRFS_I(inode)->extent_tree.lock); + em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, + start, num_bytes); + if (em) { + alloc_hint = em->block_start; + free_extent_map(em); + } + read_unlock(&BTRFS_I(inode)->extent_tree.lock); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); while (disk_num_bytes > 0) { + unsigned long op; + cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); ret = btrfs_reserve_extent(trans, root, cur_alloc_size, root->sectorsize, 0, alloc_hint, @@ -737,7 +761,6 @@ static noinline int cow_file_range(struct inode *inode, em = alloc_extent_map(GFP_NOFS); em->start = start; em->orig_start = em->start; - ram_size = ins.offset; em->len = ins.offset; @@ -747,9 +770,9 @@ static noinline int cow_file_range(struct inode *inode, set_bit(EXTENT_FLAG_PINNED, &em->flags); while (1) { - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); break; @@ -776,11 +799,17 @@ static noinline int cow_file_range(struct inode *inode, /* we're not doing compressed IO, don't unlock the first * page (which the caller expects to stay locked), don't * clear any dirty bits and don't set any writeback bits + * + * Do set the Private2 bit so we know this page was properly + * setup for writepage */ + op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0; + op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | + EXTENT_SET_PRIVATE2; + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, start, start + ram_size - 1, - locked_page, unlock, 1, - 1, 0, 0, 0); + locked_page, op); disk_num_bytes -= cur_alloc_size; num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; @@ -852,8 +881,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, u64 cur_end; int limit = 10 * 1024 * 1042; - clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | - EXTENT_DELALLOC, 1, 0, GFP_NOFS); + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, + 1, 0, NULL, GFP_NOFS); while (start < end) { async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); async_cow->inode = inode; @@ -994,6 +1023,7 @@ next_slot: if (found_key.offset > cur_offset) { extent_end = found_key.offset; + extent_type = 0; goto out_check; } @@ -1080,9 +1110,9 @@ out_check: em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); while (1) { - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); if (ret != -EEXIST) { free_extent_map(em); break; @@ -1100,8 +1130,10 @@ out_check: BUG_ON(ret); extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, - cur_offset, cur_offset + num_bytes - 1, - locked_page, 1, 1, 1, 0, 0, 0); + cur_offset, cur_offset + num_bytes - 1, + locked_page, EXTENT_CLEAR_UNLOCK_PAGE | + EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | + EXTENT_SET_PRIVATE2); cur_offset = extent_end; if (cur_offset > end) break; @@ -1147,6 +1179,89 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, return ret; } +static int btrfs_split_extent_hook(struct inode *inode, + struct extent_state *orig, u64 split) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 size; + + if (!(orig->state & EXTENT_DELALLOC)) + return 0; + + size = orig->end - orig->start + 1; + if (size > root->fs_info->max_extent) { + u64 num_extents; + u64 new_size; + + new_size = orig->end - split + 1; + num_extents = div64_u64(size + root->fs_info->max_extent - 1, + root->fs_info->max_extent); + + /* + * if we break a large extent up then leave oustanding_extents + * be, since we've already accounted for the large extent. + */ + if (div64_u64(new_size + root->fs_info->max_extent - 1, + root->fs_info->max_extent) < num_extents) + return 0; + } + + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + + return 0; +} + +/* + * extent_io.c merge_extent_hook, used to track merged delayed allocation + * extents so we can keep track of new extents that are just merged onto old + * extents, such as when we are doing sequential writes, so we can properly + * account for the metadata space we'll need. + */ +static int btrfs_merge_extent_hook(struct inode *inode, + struct extent_state *new, + struct extent_state *other) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 new_size, old_size; + u64 num_extents; + + /* not delalloc, ignore it */ + if (!(other->state & EXTENT_DELALLOC)) + return 0; + + old_size = other->end - other->start + 1; + if (new->start < other->start) + new_size = other->end - new->start + 1; + else + new_size = new->end - other->start + 1; + + /* we're not bigger than the max, unreserve the space and go */ + if (new_size <= root->fs_info->max_extent) { + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + return 0; + } + + /* + * If we grew by another max_extent, just return, we want to keep that + * reserved amount. + */ + num_extents = div64_u64(old_size + root->fs_info->max_extent - 1, + root->fs_info->max_extent); + if (div64_u64(new_size + root->fs_info->max_extent - 1, + root->fs_info->max_extent) > num_extents) + return 0; + + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + + return 0; +} + /* * extent_io.c set_bit_hook, used to track delayed allocation * bytes in this file, and to maintain the list of inodes that @@ -1155,6 +1270,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, unsigned long old, unsigned long bits) { + /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testeing for the DELALLOC @@ -1162,6 +1278,10 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, */ if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents++; + spin_unlock(&BTRFS_I(inode)->accounting_lock); btrfs_delalloc_reserve_space(root, inode, end - start + 1); spin_lock(&root->fs_info->delalloc_lock); BTRFS_I(inode)->delalloc_bytes += end - start + 1; @@ -1178,22 +1298,31 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, /* * extent_io.c clear_bit_hook, see set_bit_hook for why */ -static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, - unsigned long old, unsigned long bits) +static int btrfs_clear_bit_hook(struct inode *inode, + struct extent_state *state, unsigned long bits) { /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testeing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + if (bits & EXTENT_DO_ACCOUNTING) { + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + } + spin_lock(&root->fs_info->delalloc_lock); - if (end - start + 1 > root->fs_info->delalloc_bytes) { + if (state->end - state->start + 1 > + root->fs_info->delalloc_bytes) { printk(KERN_INFO "btrfs warning: delalloc account " "%llu %llu\n", - (unsigned long long)end - start + 1, + (unsigned long long) + state->end - state->start + 1, (unsigned long long) root->fs_info->delalloc_bytes); btrfs_delalloc_free_space(root, inode, (u64)-1); @@ -1201,9 +1330,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, BTRFS_I(inode)->delalloc_bytes = 0; } else { btrfs_delalloc_free_space(root, inode, - end - start + 1); - root->fs_info->delalloc_bytes -= end - start + 1; - BTRFS_I(inode)->delalloc_bytes -= end - start + 1; + state->end - + state->start + 1); + root->fs_info->delalloc_bytes -= state->end - + state->start + 1; + BTRFS_I(inode)->delalloc_bytes -= state->end - + state->start + 1; } if (BTRFS_I(inode)->delalloc_bytes == 0 && !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { @@ -1374,10 +1506,8 @@ again: lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); /* already ordered? We're done */ - if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, - EXTENT_ORDERED, 0)) { + if (PagePrivate2(page)) goto out; - } ordered = btrfs_lookup_ordered_extent(inode, page_start); if (ordered) { @@ -1413,11 +1543,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) struct inode *inode = page->mapping->host; struct btrfs_writepage_fixup *fixup; struct btrfs_root *root = BTRFS_I(inode)->root; - int ret; - ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end, - EXTENT_ORDERED, 0); - if (ret) + /* this page is properly in the ordered list */ + if (TestClearPagePrivate2(page)) return 0; if (PageChecked(page)) @@ -1455,9 +1583,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, BUG_ON(!path); path->leave_spinning = 1; + + /* + * we may be replacing one extent in the tree with another. + * The new extent is pinned in the extent map, and we don't want + * to drop it from the cache until it is completely in the btree. + * + * So, tell btrfs_drop_extents to leave this extent in the cache. + * the caller is expected to unpin it and allow it to be merged + * with the others. + */ ret = btrfs_drop_extents(trans, root, inode, file_pos, file_pos + num_bytes, locked_end, - file_pos, &hint); + file_pos, &hint, 0); BUG_ON(ret); ins.objectid = inode->i_ino; @@ -1485,7 +1623,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); inode_add_bytes(inode, num_bytes); - btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0); ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; @@ -1596,6 +1733,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->len, compressed, 0, 0, BTRFS_FILE_EXTENT_REG); + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, + ordered_extent->file_offset, + ordered_extent->len); BUG_ON(ret); } unlock_extent(io_tree, ordered_extent->file_offset, @@ -1623,6 +1763,7 @@ nocow: static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate) { + ClearPagePrivate2(page); return btrfs_finish_ordered_io(page->mapping->host, start, end); } @@ -1669,13 +1810,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, failrec->last_mirror = 0; failrec->bio_flags = 0; - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, failrec->len); if (em->start > start || em->start + em->len < start) { free_extent_map(em); em = NULL; } - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); if (!em || IS_ERR(em)) { kfree(failrec); @@ -1794,7 +1935,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, return 0; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && - test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) { + test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, GFP_NOFS); return 0; @@ -2352,6 +2493,69 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) return ret; } +int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, u64 objectid, + const char *name, int name_len) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u64 index; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + name, name_len, -1); + BUG_ON(!di || IS_ERR(di)); + + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); + WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + BUG_ON(ret); + btrfs_release_path(root, path); + + ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, + objectid, root->root_key.objectid, + dir->i_ino, &index, name, name_len); + if (ret < 0) { + BUG_ON(ret != -ENOENT); + di = btrfs_search_dir_index_item(root, path, dir->i_ino, + name, name_len); + BUG_ON(!di || IS_ERR(di)); + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(root, path); + index = key.offset; + } + + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, + index, name, name_len, -1); + BUG_ON(!di || IS_ERR(di)); + + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); + WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + BUG_ON(ret); + btrfs_release_path(root, path); + + btrfs_i_size_write(dir, dir->i_size - name_len * 2); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, dir); + BUG_ON(ret); + dir->i_sb->s_dirt = 1; + + btrfs_free_path(path); + return 0; +} + static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; @@ -2361,29 +2565,31 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; unsigned long nr = 0; - /* - * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir - * the root of a subvolume or snapshot - */ if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || - inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) return -ENOTEMPTY; - } trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, dir); + if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { + err = btrfs_unlink_subvol(trans, root, dir, + BTRFS_I(inode)->location.objectid, + dentry->d_name.name, + dentry->d_name.len); + goto out; + } + err = btrfs_orphan_add(trans, inode); if (err) - goto fail_trans; + goto out; /* now the directory is empty */ err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, dentry->d_name.name, dentry->d_name.len); if (!err) btrfs_i_size_write(inode, 0); - -fail_trans: +out: nr = trans->blocks_used; ret = btrfs_end_transaction_throttle(trans, root); btrfs_btree_balance_dirty(root, nr); @@ -2826,12 +3032,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) if ((offset & (blocksize - 1)) == 0) goto out; + ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); + if (ret) + goto out; + + ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + if (ret) + goto out; ret = -ENOMEM; again: page = grab_cache_page(mapping, index); - if (!page) + if (!page) { + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); goto out; + } page_start = page_offset(page); page_end = page_start + PAGE_CACHE_SIZE - 1; @@ -2864,7 +3080,16 @@ again: goto again; } - btrfs_set_extent_delalloc(inode, page_start, page_end); + clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + GFP_NOFS); + + ret = btrfs_set_extent_delalloc(inode, page_start, page_end); + if (ret) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + goto out_unlock; + } + ret = 0; if (offset != PAGE_CACHE_SIZE) { kaddr = kmap(page); @@ -2877,6 +3102,9 @@ again: unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: + if (ret) + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); unlock_page(page); page_cache_release(page); out: @@ -2895,17 +3123,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) u64 last_byte; u64 cur_offset; u64 hole_size; - int err; + int err = 0; if (size <= hole_start) return 0; - err = btrfs_check_metadata_free_space(root); + err = btrfs_truncate_page(inode->i_mapping, inode->i_size); if (err) return err; - btrfs_truncate_page(inode->i_mapping, inode->i_size); - while (1) { struct btrfs_ordered_extent *ordered; btrfs_wait_ordered_range(inode, hole_start, @@ -2935,15 +3161,21 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) cur_offset, cur_offset + hole_size, block_end, - cur_offset, &hint_byte); + cur_offset, &hint_byte, 1); + if (err) + break; + + err = btrfs_reserve_metadata_space(root, 1); if (err) break; + err = btrfs_insert_file_extent(trans, root, inode->i_ino, cur_offset, 0, 0, hole_size, 0, hole_size, 0, 0, 0); btrfs_drop_extent_cache(inode, hole_start, last_byte - 1, 0); + btrfs_unreserve_metadata_space(root, 1); } free_extent_map(em); cur_offset = last_byte; @@ -3003,6 +3235,11 @@ void btrfs_delete_inode(struct inode *inode) } btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (inode->i_nlink > 0) { + BUG_ON(btrfs_root_refs(&root->root_item) != 0); + goto no_delete; + } + btrfs_i_size_write(inode, 0); trans = btrfs_join_transaction(root, 1); @@ -3070,29 +3307,67 @@ out_err: * is kind of like crossing a mount point. */ static int fixup_tree_root_location(struct btrfs_root *root, - struct btrfs_key *location, - struct btrfs_root **sub_root, - struct dentry *dentry) + struct inode *dir, + struct dentry *dentry, + struct btrfs_key *location, + struct btrfs_root **sub_root) { - struct btrfs_root_item *ri; + struct btrfs_path *path; + struct btrfs_root *new_root; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + int ret; + int err = 0; - if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY) - return 0; - if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) - return 0; + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } - *sub_root = btrfs_read_fs_root(root->fs_info, location, - dentry->d_name.name, - dentry->d_name.len); - if (IS_ERR(*sub_root)) - return PTR_ERR(*sub_root); + err = -ENOENT; + ret = btrfs_find_root_ref(root->fs_info->tree_root, path, + BTRFS_I(dir)->root->root_key.objectid, + location->objectid); + if (ret) { + if (ret < 0) + err = ret; + goto out; + } - ri = &(*sub_root)->root_item; - location->objectid = btrfs_root_dirid(ri); - btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); - location->offset = 0; + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); + if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino || + btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) + goto out; - return 0; + ret = memcmp_extent_buffer(leaf, dentry->d_name.name, + (unsigned long)(ref + 1), + dentry->d_name.len); + if (ret) + goto out; + + btrfs_release_path(root->fs_info->tree_root, path); + + new_root = btrfs_read_fs_root_no_name(root->fs_info, location); + if (IS_ERR(new_root)) { + err = PTR_ERR(new_root); + goto out; + } + + if (btrfs_root_refs(&new_root->root_item) == 0) { + err = -ENOENT; + goto out; + } + + *sub_root = new_root; + location->objectid = btrfs_root_dirid(&new_root->root_item); + location->type = BTRFS_INODE_ITEM_KEY; + location->offset = 0; + err = 0; +out: + btrfs_free_path(path); + return err; } static void inode_tree_add(struct inode *inode) @@ -3101,11 +3376,13 @@ static void inode_tree_add(struct inode *inode) struct btrfs_inode *entry; struct rb_node **p; struct rb_node *parent; - again: p = &root->inode_tree.rb_node; parent = NULL; + if (hlist_unhashed(&inode->i_hash)) + return; + spin_lock(&root->inode_lock); while (*p) { parent = *p; @@ -3132,13 +3409,87 @@ again: static void inode_tree_del(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + int empty = 0; spin_lock(&root->inode_lock); if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + empty = RB_EMPTY_ROOT(&root->inode_tree); } spin_unlock(&root->inode_lock); + + if (empty && btrfs_root_refs(&root->root_item) == 0) { + synchronize_srcu(&root->fs_info->subvol_srcu); + spin_lock(&root->inode_lock); + empty = RB_EMPTY_ROOT(&root->inode_tree); + spin_unlock(&root->inode_lock); + if (empty) + btrfs_add_dead_root(root); + } +} + +int btrfs_invalidate_inodes(struct btrfs_root *root) +{ + struct rb_node *node; + struct rb_node *prev; + struct btrfs_inode *entry; + struct inode *inode; + u64 objectid = 0; + + WARN_ON(btrfs_root_refs(&root->root_item) != 0); + + spin_lock(&root->inode_lock); +again: + node = root->inode_tree.rb_node; + prev = NULL; + while (node) { + prev = node; + entry = rb_entry(node, struct btrfs_inode, rb_node); + + if (objectid < entry->vfs_inode.i_ino) + node = node->rb_left; + else if (objectid > entry->vfs_inode.i_ino) + node = node->rb_right; + else + break; + } + if (!node) { + while (prev) { + entry = rb_entry(prev, struct btrfs_inode, rb_node); + if (objectid <= entry->vfs_inode.i_ino) { + node = prev; + break; + } + prev = rb_next(prev); + } + } + while (node) { + entry = rb_entry(node, struct btrfs_inode, rb_node); + objectid = entry->vfs_inode.i_ino + 1; + inode = igrab(&entry->vfs_inode); + if (inode) { + spin_unlock(&root->inode_lock); + if (atomic_read(&inode->i_count) > 1) + d_prune_aliases(inode); + /* + * btrfs_drop_inode will remove it from + * the inode cache when its usage count + * hits zero. + */ + iput(inode); + cond_resched(); + spin_lock(&root->inode_lock); + goto again; + } + + if (cond_resched_lock(&root->inode_lock)) + goto again; + + node = rb_next(node); + } + spin_unlock(&root->inode_lock); + return 0; } static noinline void init_btrfs_i(struct inode *inode) @@ -3148,6 +3499,7 @@ static noinline void init_btrfs_i(struct inode *inode) bi->generation = 0; bi->sequence = 0; bi->last_trans = 0; + bi->last_sub_trans = 0; bi->logged_trans = 0; bi->delalloc_bytes = 0; bi->reserved_bytes = 0; @@ -3225,15 +3577,41 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, return inode; } +static struct inode *new_simple_dir(struct super_block *s, + struct btrfs_key *key, + struct btrfs_root *root) +{ + struct inode *inode = new_inode(s); + + if (!inode) + return ERR_PTR(-ENOMEM); + + init_btrfs_i(inode); + + BTRFS_I(inode)->root = root; + memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); + BTRFS_I(inode)->dummy_inode = 1; + + inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; + inode->i_op = &simple_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + return inode; +} + struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { struct inode *inode; - struct btrfs_inode *bi = BTRFS_I(dir); - struct btrfs_root *root = bi->root; + struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; struct btrfs_key location; + int index; int ret; + dentry->d_op = &btrfs_dentry_operations; + if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -3242,29 +3620,52 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (ret < 0) return ERR_PTR(ret); - inode = NULL; - if (location.objectid) { - ret = fixup_tree_root_location(root, &location, &sub_root, - dentry); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) - return ERR_PTR(-ENOENT); + if (location.objectid == 0) + return NULL; + + if (location.type == BTRFS_INODE_ITEM_KEY) { + inode = btrfs_iget(dir->i_sb, &location, root); + return inode; + } + + BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); + + index = srcu_read_lock(&root->fs_info->subvol_srcu); + ret = fixup_tree_root_location(root, dir, dentry, + &location, &sub_root); + if (ret < 0) { + if (ret != -ENOENT) + inode = ERR_PTR(ret); + else + inode = new_simple_dir(dir->i_sb, &location, sub_root); + } else { inode = btrfs_iget(dir->i_sb, &location, sub_root); - if (IS_ERR(inode)) - return ERR_CAST(inode); } + srcu_read_unlock(&root->fs_info->subvol_srcu, index); + return inode; } +static int btrfs_dentry_delete(struct dentry *dentry) +{ + struct btrfs_root *root; + + if (!dentry->d_inode && !IS_ROOT(dentry)) + dentry = dentry->d_parent; + + if (dentry->d_inode) { + root = BTRFS_I(dentry->d_inode)->root; + if (btrfs_root_refs(&root->root_item) == 0) + return 1; + } + return 0; +} + static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct inode *inode; - if (dentry->d_name.len > BTRFS_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); - inode = btrfs_lookup_dentry(dir, dentry); if (IS_ERR(inode)) return ERR_CAST(inode); @@ -3603,9 +4004,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (ret != 0) goto fail; - if (objectid > root->highest_inode) - root->highest_inode = objectid; - inode->i_uid = current_fsuid(); if (dir && (dir->i_mode & S_ISGID)) { @@ -3673,26 +4071,35 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, struct inode *parent_inode, struct inode *inode, const char *name, int name_len, int add_backref, u64 index) { - int ret; + int ret = 0; struct btrfs_key key; struct btrfs_root *root = BTRFS_I(parent_inode)->root; - key.objectid = inode->i_ino; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); - key.offset = 0; + if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { + memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); + } else { + key.objectid = inode->i_ino; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + } + + if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + key.objectid, root->root_key.objectid, + parent_inode->i_ino, + index, name, name_len); + } else if (add_backref) { + ret = btrfs_insert_inode_ref(trans, root, + name, name_len, inode->i_ino, + parent_inode->i_ino, index); + } - ret = btrfs_insert_dir_item(trans, root, name, name_len, - parent_inode->i_ino, - &key, btrfs_inode_type(inode), - index); if (ret == 0) { - if (add_backref) { - ret = btrfs_insert_inode_ref(trans, root, - name, name_len, - inode->i_ino, - parent_inode->i_ino, - index); - } + ret = btrfs_insert_dir_item(trans, root, name, name_len, + parent_inode->i_ino, &key, + btrfs_inode_type(inode), index); + BUG_ON(ret); + btrfs_i_size_write(parent_inode, parent_inode->i_size + name_len * 2); parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; @@ -3732,11 +4139,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; - err = btrfs_check_metadata_free_space(root); + /* + * 2 for inode item and ref + * 2 for dir items + * 1 for xattr if selinux is on + */ + err = btrfs_reserve_metadata_space(root, 5); if (err) - goto fail; + return err; trans = btrfs_start_transaction(root, 1); + if (!trans) + goto fail; btrfs_set_trans_block_group(trans, dir); err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); @@ -3774,6 +4188,7 @@ out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); fail: + btrfs_unreserve_metadata_space(root, 5); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -3794,10 +4209,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, u64 objectid; u64 index = 0; - err = btrfs_check_metadata_free_space(root); + /* + * 2 for inode item and ref + * 2 for dir items + * 1 for xattr if selinux is on + */ + err = btrfs_reserve_metadata_space(root, 5); if (err) - goto fail; + return err; + trans = btrfs_start_transaction(root, 1); + if (!trans) + goto fail; btrfs_set_trans_block_group(trans, dir); err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); @@ -3838,6 +4261,7 @@ out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); fail: + btrfs_unreserve_metadata_space(root, 5); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -3860,10 +4284,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink == 0) return -ENOENT; - btrfs_inc_nlink(inode); - err = btrfs_check_metadata_free_space(root); + /* + * 1 item for inode ref + * 2 items for dir items + */ + err = btrfs_reserve_metadata_space(root, 3); if (err) - goto fail; + return err; + + btrfs_inc_nlink(inode); + err = btrfs_set_inode_index(dir, &index); if (err) goto fail; @@ -3875,20 +4305,19 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, err = btrfs_add_nondir(trans, dentry, inode, 1, index); - if (err) - drop_inode = 1; - - btrfs_update_inode_block_group(trans, dir); - err = btrfs_update_inode(trans, root, inode); - - if (err) + if (err) { drop_inode = 1; + } else { + btrfs_update_inode_block_group(trans, dir); + err = btrfs_update_inode(trans, root, inode); + BUG_ON(err); + btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); + } nr = trans->blocks_used; - - btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); btrfs_end_transaction_throttle(trans, root); fail: + btrfs_unreserve_metadata_space(root, 3); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -3908,17 +4337,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) u64 index = 0; unsigned long nr = 1; - err = btrfs_check_metadata_free_space(root); + /* + * 2 items for inode and ref + * 2 items for dir items + * 1 for xattr if selinux is on + */ + err = btrfs_reserve_metadata_space(root, 5); if (err) - goto out_unlock; + return err; trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, dir); - - if (IS_ERR(trans)) { - err = PTR_ERR(trans); + if (!trans) { + err = -ENOMEM; goto out_unlock; } + btrfs_set_trans_block_group(trans, dir); err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); if (err) { @@ -3967,6 +4400,7 @@ out_fail: btrfs_end_transaction_throttle(trans, root); out_unlock: + btrfs_unreserve_metadata_space(root, 5); if (drop_on_err) iput(inode); btrfs_btree_balance_dirty(root, nr); @@ -4064,11 +4498,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, int compressed; again: - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (em) em->bdev = root->fs_info->fs_devices->latest_bdev; - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); if (em) { if (em->start > start || em->start + em->len <= start) @@ -4215,6 +4649,11 @@ again: map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, copy_size); + if (pg_offset + copy_size < PAGE_CACHE_SIZE) { + memset(map + pg_offset + copy_size, 0, + PAGE_CACHE_SIZE - pg_offset - + copy_size); + } kunmap(page); } flush_dcache_page(page); @@ -4259,7 +4698,7 @@ insert: } err = 0; - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); /* it is possible that someone inserted the extent into the tree * while we had the lock dropped. It is also possible that @@ -4299,7 +4738,7 @@ insert: err = 0; } } - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); out: if (path) btrfs_free_path(path); @@ -4398,13 +4837,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + + /* + * we have the page locked, so new writeback can't start, + * and the dirty bit won't be cleared while we are here. + * + * Wait for IO on this page so that we can safely clear + * the PagePrivate2 bit and do ordered accounting + */ wait_on_page_writeback(page); + tree = &BTRFS_I(page->mapping->host)->io_tree; if (offset) { btrfs_releasepage(page, GFP_NOFS); return; } - lock_extent(tree, page_start, page_end, GFP_NOFS); ordered = btrfs_lookup_ordered_extent(page->mapping->host, page_offset(page)); @@ -4415,16 +4862,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) */ clear_extent_bit(tree, page_start, page_end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_LOCKED, 1, 0, GFP_NOFS); - btrfs_finish_ordered_io(page->mapping->host, - page_start, page_end); + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, + NULL, GFP_NOFS); + /* + * whoever cleared the private bit is responsible + * for the finish_ordered_io + */ + if (TestClearPagePrivate2(page)) { + btrfs_finish_ordered_io(page->mapping->host, + page_start, page_end); + } btrfs_put_ordered_extent(ordered); lock_extent(tree, page_start, page_end, GFP_NOFS); } clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_ORDERED, - 1, 1, GFP_NOFS); + EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); __btrfs_releasepage(page, GFP_NOFS); ClearPageChecked(page); @@ -4473,6 +4926,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out; } + ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + if (ret) { + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + ret = VM_FAULT_SIGBUS; + goto out; + } + ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ again: lock_page(page); @@ -4504,7 +4964,24 @@ again: goto again; } - btrfs_set_extent_delalloc(inode, page_start, page_end); + /* + * XXX - page_mkwrite gets called every time the page is dirtied, even + * if it was already dirty, so for space accounting reasons we need to + * clear any delalloc bits for the range we are fixing to save. There + * is probably a better way to do this, but for now keep consistent with + * prepare_pages in the normal write path. + */ + clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, + GFP_NOFS); + + ret = btrfs_set_extent_delalloc(inode, page_start, page_end); + if (ret) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + ret = VM_FAULT_SIGBUS; + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); + goto out_unlock; + } ret = 0; /* page is wholly or partially inside EOF */ @@ -4521,11 +4998,17 @@ again: } ClearPageChecked(page); set_page_dirty(page); + SetPageUptodate(page); + + BTRFS_I(inode)->last_trans = root->fs_info->generation; + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; - BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; unlock_extent(io_tree, page_start, page_end, GFP_NOFS); out_unlock: + btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + if (!ret) + return VM_FAULT_LOCKED; unlock_page(page); out: return ret; @@ -4544,7 +5027,9 @@ static void btrfs_truncate(struct inode *inode) if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; - btrfs_truncate_page(inode->i_mapping, inode->i_size); + ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); + if (ret) + return; btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); trans = btrfs_start_transaction(root, 1); @@ -4594,11 +5079,11 @@ out: * create a new subvolume directory/inode (helper for the ioctl). */ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, struct dentry *dentry, + struct btrfs_root *new_root, u64 new_dirid, u64 alloc_hint) { struct inode *inode; - int error; + int err; u64 index = 0; inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, @@ -4611,11 +5096,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, inode->i_nlink = 1; btrfs_i_size_write(inode, 0); - error = btrfs_update_inode(trans, new_root, inode); - if (error) - return error; + err = btrfs_update_inode(trans, new_root, inode); + BUG_ON(err); - d_instantiate(dentry, inode); + iput(inode); return 0; } @@ -4640,7 +5124,11 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) if (!ei) return NULL; ei->last_trans = 0; + ei->last_sub_trans = 0; ei->logged_trans = 0; + ei->outstanding_extents = 0; + ei->reserved_extents = 0; + spin_lock_init(&ei->accounting_lock); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->i_orphan); INIT_LIST_HEAD(&ei->ordered_operations); @@ -4693,6 +5181,16 @@ void btrfs_destroy_inode(struct inode *inode) kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } +void btrfs_drop_inode(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0) + generic_delete_inode(inode); + else + generic_drop_inode(inode); +} + static void init_once(void *foo) { struct btrfs_inode *ei = (struct btrfs_inode *) foo; @@ -4761,31 +5259,37 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(old_dir)->root; + struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; struct timespec ctime = CURRENT_TIME; u64 index = 0; + u64 root_objectid; int ret; - /* we're not allowed to rename between subvolumes */ - if (BTRFS_I(old_inode)->root->root_key.objectid != - BTRFS_I(new_dir)->root->root_key.objectid) + if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + return -EPERM; + + /* we only allow rename subvolume link between subvolumes */ + if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) return -EXDEV; + if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || + (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) + return -ENOTEMPTY; + if (S_ISDIR(old_inode->i_mode) && new_inode && - new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) { + new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; - } - /* to rename a snapshot or subvolume, we need to juggle the - * backrefs. This isn't coded yet + /* + * 2 items for dir items + * 1 item for orphan entry + * 1 item for ref */ - if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) - return -EXDEV; - - ret = btrfs_check_metadata_free_space(root); + ret = btrfs_reserve_metadata_space(root, 4); if (ret) - goto out_unlock; + return ret; /* * we're using rename to replace one file with another. @@ -4796,8 +5300,40 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) filemap_flush(old_inode->i_mapping); + /* close the racy window with snapshot create/destroy ioctl */ + if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + down_read(&root->fs_info->subvol_sem); + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, new_dir); + + if (dest != root) + btrfs_record_root_in_trans(trans, dest); + ret = btrfs_set_inode_index(new_dir, &index); + if (ret) + goto out_fail; + + if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { + /* force full log commit if subvolume involved. */ + root->fs_info->last_trans_log_full_commit = trans->transid; + } else { + ret = btrfs_insert_inode_ref(trans, dest, + new_dentry->d_name.name, + new_dentry->d_name.len, + old_inode->i_ino, + new_dir->i_ino, index); + if (ret) + goto out_fail; + /* + * this is an ugly little race, but the rename is required + * to make sure that if we crash, the inode is either at the + * old name or the new one. pinning the log transaction lets + * us make sure we don't allow a log commit to come in after + * we unlink the name but before we add the new name back in. + */ + btrfs_pin_log_trans(root); + } /* * make sure the inode gets flushed if it is replacing * something. @@ -4807,18 +5343,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, btrfs_add_ordered_operation(trans, root, old_inode); } - /* - * this is an ugly little race, but the rename is required to make - * sure that if we crash, the inode is either at the old name - * or the new one. pinning the log transaction lets us make sure - * we don't allow a log commit to come in after we unlink the - * name but before we add the new name back in. - */ - btrfs_pin_log_trans(root); - - btrfs_set_trans_block_group(trans, new_dir); - - btrfs_inc_nlink(old_dentry->d_inode); old_dir->i_ctime = old_dir->i_mtime = ctime; new_dir->i_ctime = new_dir->i_mtime = ctime; old_inode->i_ctime = ctime; @@ -4826,47 +5350,60 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); - ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, - old_dentry->d_name.name, - old_dentry->d_name.len); - if (ret) - goto out_fail; + if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { + root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; + ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, + old_dentry->d_name.name, + old_dentry->d_name.len); + } else { + btrfs_inc_nlink(old_dentry->d_inode); + ret = btrfs_unlink_inode(trans, root, old_dir, + old_dentry->d_inode, + old_dentry->d_name.name, + old_dentry->d_name.len); + } + BUG_ON(ret); if (new_inode) { new_inode->i_ctime = CURRENT_TIME; - ret = btrfs_unlink_inode(trans, root, new_dir, - new_dentry->d_inode, - new_dentry->d_name.name, - new_dentry->d_name.len); - if (ret) - goto out_fail; + if (unlikely(new_inode->i_ino == + BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { + root_objectid = BTRFS_I(new_inode)->location.objectid; + ret = btrfs_unlink_subvol(trans, dest, new_dir, + root_objectid, + new_dentry->d_name.name, + new_dentry->d_name.len); + BUG_ON(new_inode->i_nlink == 0); + } else { + ret = btrfs_unlink_inode(trans, dest, new_dir, + new_dentry->d_inode, + new_dentry->d_name.name, + new_dentry->d_name.len); + } + BUG_ON(ret); if (new_inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, new_dentry->d_inode); - if (ret) - goto out_fail; + BUG_ON(ret); } - } - ret = btrfs_set_inode_index(new_dir, &index); - if (ret) - goto out_fail; - ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode, - old_inode, new_dentry->d_name.name, - new_dentry->d_name.len, 1, index); - if (ret) - goto out_fail; + ret = btrfs_add_link(trans, new_dir, old_inode, + new_dentry->d_name.name, + new_dentry->d_name.len, 0, index); + BUG_ON(ret); - btrfs_log_new_name(trans, old_inode, old_dir, - new_dentry->d_parent); + if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { + btrfs_log_new_name(trans, old_inode, old_dir, + new_dentry->d_parent); + btrfs_end_log_trans(root); + } out_fail: - - /* this btrfs_end_log_trans just allows the current - * log-sub transaction to complete - */ - btrfs_end_log_trans(root); btrfs_end_transaction_throttle(trans, root); -out_unlock: + + if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&root->fs_info->subvol_sem); + + btrfs_unreserve_metadata_space(root, 4); return ret; } @@ -4938,11 +5475,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) return -ENAMETOOLONG; - err = btrfs_check_metadata_free_space(root); + /* + * 2 items for inode item and ref + * 2 items for dir items + * 1 item for xattr if selinux is on + */ + err = btrfs_reserve_metadata_space(root, 5); if (err) - goto out_fail; + return err; trans = btrfs_start_transaction(root, 1); + if (!trans) + goto out_fail; btrfs_set_trans_block_group(trans, dir); err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); @@ -5023,6 +5567,7 @@ out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); out_fail: + btrfs_unreserve_metadata_space(root, 5); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -5044,6 +5589,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, while (num_bytes > 0) { alloc_size = min(num_bytes, root->fs_info->max_extent); + + ret = btrfs_reserve_metadata_space(root, 1); + if (ret) + goto out; + ret = btrfs_reserve_extent(trans, root, alloc_size, root->sectorsize, 0, alloc_hint, (u64)-1, &ins, 1); @@ -5058,9 +5608,12 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); BUG_ON(ret); + btrfs_drop_extent_cache(inode, cur_offset, + cur_offset + ins.offset -1, 0); num_bytes -= ins.offset; cur_offset += ins.offset; alloc_hint = ins.objectid + ins.offset; + btrfs_unreserve_metadata_space(root, 1); } out: if (cur_offset > start) { @@ -5223,7 +5776,8 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = { .lookup = btrfs_lookup, .permission = btrfs_permission, }; -static struct file_operations btrfs_dir_file_operations = { + +static const struct file_operations btrfs_dir_file_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = btrfs_real_readdir, @@ -5245,6 +5799,8 @@ static struct extent_io_ops btrfs_extent_io_ops = { .readpage_io_failed_hook = btrfs_io_failed_hook, .set_bit_hook = btrfs_set_bit_hook, .clear_bit_hook = btrfs_clear_bit_hook, + .merge_extent_hook = btrfs_merge_extent_hook, + .split_extent_hook = btrfs_split_extent_hook, }; /* @@ -5269,6 +5825,7 @@ static const struct address_space_operations btrfs_aops = { .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, .set_page_dirty = btrfs_set_page_dirty, + .error_remove_page = generic_error_remove_page, }; static const struct address_space_operations btrfs_symlink_aops = { @@ -5309,3 +5866,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, }; + +const struct dentry_operations btrfs_dentry_operations = { + .d_delete = btrfs_dentry_delete, +}; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bd88f25..cdbb054 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -230,8 +230,8 @@ static noinline int create_subvol(struct btrfs_root *root, struct btrfs_root_item root_item; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; - struct btrfs_root *new_root = root; - struct inode *dir; + struct btrfs_root *new_root; + struct inode *dir = dentry->d_parent->d_inode; int ret; int err; u64 objectid; @@ -239,9 +239,15 @@ static noinline int create_subvol(struct btrfs_root *root, u64 index = 0; unsigned long nr = 1; - ret = btrfs_check_metadata_free_space(root); + /* + * 1 - inode item + * 2 - refs + * 1 - root item + * 2 - dir items + */ + ret = btrfs_reserve_metadata_space(root, 6); if (ret) - goto fail_commit; + return ret; trans = btrfs_start_transaction(root, 1); BUG_ON(!trans); @@ -304,11 +310,17 @@ static noinline int create_subvol(struct btrfs_root *root, if (ret) goto fail; + key.offset = (u64)-1; + new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); + BUG_ON(IS_ERR(new_root)); + + btrfs_record_root_in_trans(trans, new_root); + + ret = btrfs_create_subvol_root(trans, new_root, new_dirid, + BTRFS_I(dir)->block_group); /* * insert the directory item */ - key.offset = (u64)-1; - dir = dentry->d_parent->d_inode; ret = btrfs_set_inode_index(dir, &index); BUG_ON(ret); @@ -322,43 +334,20 @@ static noinline int create_subvol(struct btrfs_root *root, ret = btrfs_update_inode(trans, root, dir); BUG_ON(ret); - /* add the backref first */ ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, - objectid, BTRFS_ROOT_BACKREF_KEY, - root->root_key.objectid, + objectid, root->root_key.objectid, dir->i_ino, index, name, namelen); BUG_ON(ret); - /* now add the forward ref */ - ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, - root->root_key.objectid, BTRFS_ROOT_REF_KEY, - objectid, - dir->i_ino, index, name, namelen); - - BUG_ON(ret); - - ret = btrfs_commit_transaction(trans, root); - if (ret) - goto fail_commit; - - new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); - BUG_ON(!new_root); - - trans = btrfs_start_transaction(new_root, 1); - BUG_ON(!trans); - - ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid, - BTRFS_I(dir)->block_group); - if (ret) - goto fail; - + d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); fail: nr = trans->blocks_used; - err = btrfs_commit_transaction(trans, new_root); + err = btrfs_commit_transaction(trans, root); if (err && !ret) ret = err; -fail_commit: + + btrfs_unreserve_metadata_space(root, 6); btrfs_btree_balance_dirty(root, nr); return ret; } @@ -375,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, if (!root->ref_cows) return -EINVAL; - ret = btrfs_check_metadata_free_space(root); + /* + * 1 - inode item + * 2 - refs + * 1 - root item + * 2 - dir items + */ + ret = btrfs_reserve_metadata_space(root, 6); if (ret) goto fail_unlock; pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); if (!pending_snapshot) { ret = -ENOMEM; + btrfs_unreserve_metadata_space(root, 6); goto fail_unlock; } pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); if (!pending_snapshot->name) { ret = -ENOMEM; kfree(pending_snapshot); + btrfs_unreserve_metadata_space(root, 6); goto fail_unlock; } memcpy(pending_snapshot->name, name, namelen); @@ -420,14 +417,15 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) * sys_mkdirat and vfs_mkdir, but we only do a single component lookup * inside this filesystem so it's quite a bit simpler. */ -static noinline int btrfs_mksubvol(struct path *parent, char *name, - int mode, int namelen, +static noinline int btrfs_mksubvol(struct path *parent, + char *name, int namelen, struct btrfs_root *snap_src) { + struct inode *dir = parent->dentry->d_inode; struct dentry *dentry; int error; - mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); dentry = lookup_one_len(name, parent->dentry, namelen); error = PTR_ERR(dentry); @@ -438,99 +436,39 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, if (dentry->d_inode) goto out_dput; - if (!IS_POSIXACL(parent->dentry->d_inode)) - mode &= ~current_umask(); - error = mnt_want_write(parent->mnt); if (error) goto out_dput; - error = btrfs_may_create(parent->dentry->d_inode, dentry); + error = btrfs_may_create(dir, dentry); if (error) goto out_drop_write; - /* - * Actually perform the low-level subvolume creation after all - * this VFS fuzz. - * - * Eventually we want to pass in an inode under which we create this - * subvolume, but for now all are under the filesystem root. - * - * Also we should pass on the mode eventually to allow creating new - * subvolume with specific mode bits. - */ + down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); + + if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) + goto out_up_read; + if (snap_src) { - struct dentry *dir = dentry->d_parent; - struct dentry *test = dir->d_parent; - struct btrfs_path *path = btrfs_alloc_path(); - int ret; - u64 test_oid; - u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid; - - test_oid = snap_src->root_key.objectid; - - ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, - path, parent_oid, test_oid); - if (ret == 0) - goto create; - btrfs_release_path(snap_src->fs_info->tree_root, path); - - /* we need to make sure we aren't creating a directory loop - * by taking a snapshot of something that has our current - * subvol in its directory tree. So, this loops through - * the dentries and checks the forward refs for each subvolume - * to see if is references the subvolume where we are - * placing this new snapshot. - */ - while (1) { - if (!test || - dir == snap_src->fs_info->sb->s_root || - test == snap_src->fs_info->sb->s_root || - test->d_inode->i_sb != snap_src->fs_info->sb) { - break; - } - if (S_ISLNK(test->d_inode->i_mode)) { - printk(KERN_INFO "Btrfs symlink in snapshot " - "path, failed\n"); - error = -EMLINK; - btrfs_free_path(path); - goto out_drop_write; - } - test_oid = - BTRFS_I(test->d_inode)->root->root_key.objectid; - ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, - path, test_oid, parent_oid); - if (ret == 0) { - printk(KERN_INFO "Btrfs snapshot creation " - "failed, looping\n"); - error = -EMLINK; - btrfs_free_path(path); - goto out_drop_write; - } - btrfs_release_path(snap_src->fs_info->tree_root, path); - test = test->d_parent; - } -create: - btrfs_free_path(path); - error = create_snapshot(snap_src, dentry, name, namelen); + error = create_snapshot(snap_src, dentry, + name, namelen); } else { - error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root, - dentry, name, namelen); + error = create_subvol(BTRFS_I(dir)->root, dentry, + name, namelen); } - if (error) - goto out_drop_write; - - fsnotify_mkdir(parent->dentry->d_inode, dentry); + if (!error) + fsnotify_mkdir(dir, dentry); +out_up_read: + up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); out_drop_write: mnt_drop_write(parent->mnt); out_dput: dput(dentry); out_unlock: - mutex_unlock(&parent->dentry->d_inode->i_mutex); + mutex_unlock(&dir->i_mutex); return error; } - static int btrfs_defrag_file(struct file *file) { struct inode *inode = fdentry(file)->d_inode; @@ -596,9 +534,8 @@ again: clear_page_dirty_for_io(page); btrfs_set_extent_delalloc(inode, page_start, page_end); - - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); set_page_dirty(page); + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); unlock_page(page); page_cache_release(page); balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); @@ -609,7 +546,8 @@ out_unlock: return 0; } -static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) +static noinline int btrfs_ioctl_resize(struct btrfs_root *root, + void __user *arg) { u64 new_size; u64 old_size; @@ -718,10 +656,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, { struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; struct btrfs_ioctl_vol_args *vol_args; - struct btrfs_dir_item *di; - struct btrfs_path *path; struct file *src_file; - u64 root_dirid; int namelen; int ret = 0; @@ -739,32 +674,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, goto out; } - path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } - - root_dirid = root->fs_info->sb->s_root->d_inode->i_ino, - di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, - path, root_dirid, - vol_args->name, namelen, 0); - btrfs_free_path(path); - - if (di && !IS_ERR(di)) { - ret = -EEXIST; - goto out; - } - - if (IS_ERR(di)) { - ret = PTR_ERR(di); - goto out; - } - if (subvol) { - ret = btrfs_mksubvol(&file->f_path, vol_args->name, - file->f_path.dentry->d_inode->i_mode, - namelen, NULL); + ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, + NULL); } else { struct inode *src_inode; src_file = fget(vol_args->fd); @@ -781,17 +693,157 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, fput(src_file); goto out; } - ret = btrfs_mksubvol(&file->f_path, vol_args->name, - file->f_path.dentry->d_inode->i_mode, - namelen, BTRFS_I(src_inode)->root); + ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, + BTRFS_I(src_inode)->root); fput(src_file); } - out: kfree(vol_args); return ret; } +/* + * helper to check if the subvolume references other subvolumes + */ +static noinline int may_destroy_subvol(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = root->root_key.objectid; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root->fs_info->tree_root, + &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + ret = 0; + if (path->slots[0] > 0) { + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == root->root_key.objectid && + key.type == BTRFS_ROOT_REF_KEY) + ret = -ENOTEMPTY; + } +out: + btrfs_free_path(path); + return ret; +} + +static noinline int btrfs_ioctl_snap_destroy(struct file *file, + void __user *arg) +{ + struct dentry *parent = fdentry(file); + struct dentry *dentry; + struct inode *dir = parent->d_inode; + struct inode *inode; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *dest = NULL; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_trans_handle *trans; + int namelen; + int ret; + int err = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + namelen = strlen(vol_args->name); + if (strchr(vol_args->name, '/') || + strncmp(vol_args->name, "..", namelen) == 0) { + err = -EINVAL; + goto out; + } + + err = mnt_want_write(file->f_path.mnt); + if (err) + goto out; + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + dentry = lookup_one_len(vol_args->name, parent, namelen); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_unlock_dir; + } + + if (!dentry->d_inode) { + err = -ENOENT; + goto out_dput; + } + + inode = dentry->d_inode; + if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { + err = -EINVAL; + goto out_dput; + } + + dest = BTRFS_I(inode)->root; + + mutex_lock(&inode->i_mutex); + err = d_invalidate(dentry); + if (err) + goto out_unlock; + + down_write(&root->fs_info->subvol_sem); + + err = may_destroy_subvol(dest); + if (err) + goto out_up_write; + + trans = btrfs_start_transaction(root, 1); + ret = btrfs_unlink_subvol(trans, root, dir, + dest->root_key.objectid, + dentry->d_name.name, + dentry->d_name.len); + BUG_ON(ret); + + btrfs_record_root_in_trans(trans, dest); + + memset(&dest->root_item.drop_progress, 0, + sizeof(dest->root_item.drop_progress)); + dest->root_item.drop_level = 0; + btrfs_set_root_refs(&dest->root_item, 0); + + ret = btrfs_insert_orphan_item(trans, + root->fs_info->tree_root, + dest->root_key.objectid); + BUG_ON(ret); + + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + inode->i_flags |= S_DEAD; +out_up_write: + up_write(&root->fs_info->subvol_sem); +out_unlock: + mutex_unlock(&inode->i_mutex); + if (!err) { + shrink_dcache_sb(root->fs_info->sb); + btrfs_invalidate_inodes(dest); + d_delete(dentry); + } +out_dput: + dput(dentry); +out_unlock_dir: + mutex_unlock(&dir->i_mutex); + mnt_drop_write(file->f_path.mnt); +out: + kfree(vol_args); + return err; +} + static int btrfs_ioctl_defrag(struct file *file) { struct inode *inode = fdentry(file)->d_inode; @@ -865,8 +917,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) return ret; } -static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, - u64 off, u64 olen, u64 destoff) +static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) { struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -976,7 +1028,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, /* punch hole in destination first */ btrfs_drop_extents(trans, root, inode, off, off + len, - off + len, 0, &hint_byte); + off + len, 0, &hint_byte, 1); /* clone data */ key.objectid = src->i_ino; @@ -1071,9 +1123,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, datao += off - key.offset; datal -= off - key.offset; } - if (key.offset + datao + datal + key.offset > - off + len) - datal = off + len - key.offset - datao; + + if (key.offset + datal > off + len) + datal = off + len - key.offset; + /* disko == 0 means it's a hole */ if (!disko) datao = 0; @@ -1182,15 +1235,15 @@ static long btrfs_ioctl_trans_start(struct file *file) struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - int ret = 0; + int ret; + ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + goto out; - if (file->private_data) { - ret = -EINPROGRESS; + ret = -EINPROGRESS; + if (file->private_data) goto out; - } ret = mnt_want_write(file->f_path.mnt); if (ret) @@ -1200,12 +1253,19 @@ static long btrfs_ioctl_trans_start(struct file *file) root->fs_info->open_ioctl_trans++; mutex_unlock(&root->fs_info->trans_mutex); + ret = -ENOMEM; trans = btrfs_start_ioctl_transaction(root, 0); - if (trans) - file->private_data = trans; - else - ret = -ENOMEM; - /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ + if (!trans) + goto out_drop; + + file->private_data = trans; + return 0; + +out_drop: + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans--; + mutex_unlock(&root->fs_info->trans_mutex); + mnt_drop_write(file->f_path.mnt); out: return ret; } @@ -1221,24 +1281,20 @@ long btrfs_ioctl_trans_end(struct file *file) struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; - int ret = 0; trans = file->private_data; - if (!trans) { - ret = -EINVAL; - goto out; - } - btrfs_end_transaction(trans, root); + if (!trans) + return -EINVAL; file->private_data = NULL; + btrfs_end_transaction(trans, root); + mutex_lock(&root->fs_info->trans_mutex); root->fs_info->open_ioctl_trans--; mutex_unlock(&root->fs_info->trans_mutex); mnt_drop_write(file->f_path.mnt); - -out: - return ret; + return 0; } long btrfs_ioctl(struct file *file, unsigned int @@ -1258,6 +1314,8 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_snap_create(file, argp, 0); case BTRFS_IOC_SUBVOL_CREATE: return btrfs_ioctl_snap_create(file, argp, 1); + case BTRFS_IOC_SNAP_DESTROY: + return btrfs_ioctl_snap_destroy(file, argp); case BTRFS_IOC_DEFRAG: return btrfs_ioctl_defrag(file); case BTRFS_IOC_RESIZE: diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index b320b10..bc49914 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -65,5 +65,6 @@ struct btrfs_ioctl_clone_range_args { #define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ struct btrfs_ioctl_vol_args) - +#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ + struct btrfs_ioctl_vol_args) #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 7b2f401..5799bc4 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, * * len is the length of the extent * - * This also sets the EXTENT_ORDERED bit on the range in the inode. - * * The tree is given a single reference on the ordered extent that was * inserted. */ @@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->start = start; entry->len = len; entry->disk_len = disk_len; + entry->bytes_left = len; entry->inode = inode; if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); @@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, &entry->rb_node); BUG_ON(node); - set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset, - entry_end(entry) - 1, GFP_NOFS); - spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); list_add_tail(&entry->root_extent_list, &BTRFS_I(inode)->root->fs_info->ordered_extents); @@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; int ret; tree = &BTRFS_I(inode)->ordered_tree; mutex_lock(&tree->mutex); - clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1, - GFP_NOFS); node = tree_search(tree, file_offset); if (!node) { ret = 1; @@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, goto out; } - ret = test_range_bit(io_tree, entry->file_offset, - entry->file_offset + entry->len - 1, - EXTENT_ORDERED, 0); - if (ret == 0) + if (io_size > entry->bytes_left) { + printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", + (unsigned long long)entry->bytes_left, + (unsigned long long)io_size); + } + entry->bytes_left -= io_size; + if (entry->bytes_left == 0) ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + else + ret = 1; out: mutex_unlock(&tree->mutex); return ret == 0; @@ -308,6 +306,12 @@ int btrfs_remove_ordered_extent(struct inode *inode, tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + spin_lock(&BTRFS_I(inode)->accounting_lock); + BTRFS_I(inode)->outstanding_extents--; + spin_unlock(&BTRFS_I(inode)->accounting_lock); + btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root, + inode, 1); + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); @@ -460,7 +464,7 @@ void btrfs_start_ordered_extent(struct inode *inode, * start IO on any dirty ones so the wait doesn't stall waiting * for pdflush to find them */ - btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL); + filemap_fdatawrite_range(inode->i_mapping, start, end); if (wait) { wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); @@ -476,6 +480,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) u64 orig_end; u64 wait_end; struct btrfs_ordered_extent *ordered; + int found; if (start + len < start) { orig_end = INT_LIMIT(loff_t); @@ -489,19 +494,18 @@ again: /* start IO across the range first to instantiate any delalloc * extents */ - btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); /* The compression code will leave pages locked but return from * writepage without setting the page writeback. Starting again * with WB_SYNC_ALL will end up waiting for the IO to actually start. */ - btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); + filemap_fdatawrite_range(inode->i_mapping, start, orig_end); - btrfs_wait_on_page_writeback_range(inode->i_mapping, - start >> PAGE_CACHE_SHIFT, - orig_end >> PAGE_CACHE_SHIFT); + filemap_fdatawait_range(inode->i_mapping, start, orig_end); end = orig_end; + found = 0; while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, end); if (!ordered) @@ -514,6 +518,7 @@ again: btrfs_put_ordered_extent(ordered); break; } + found++; btrfs_start_ordered_extent(inode, ordered, 1); end = ordered->file_offset; btrfs_put_ordered_extent(ordered); @@ -521,8 +526,8 @@ again: break; end--; } - if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, - EXTENT_ORDERED | EXTENT_DELALLOC, 0)) { + if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, + EXTENT_DELALLOC, 0, NULL)) { schedule_timeout(1); goto again; } @@ -613,7 +618,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, */ if (test_range_bit(io_tree, disk_i_size, ordered->file_offset + ordered->len - 1, - EXTENT_DELALLOC, 0)) { + EXTENT_DELALLOC, 0, NULL)) { goto out; } /* @@ -664,7 +669,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, */ if (i_size_test > entry_end(ordered) && !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, - EXTENT_DELALLOC, 0)) { + EXTENT_DELALLOC, 0, NULL)) { new_i_size = min_t(u64, i_size_test, i_size_read(inode)); } BTRFS_I(inode)->disk_i_size = new_i_size; @@ -715,89 +720,6 @@ out: } -/** - * taken from mm/filemap.c because it isn't exported - * - * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range - * @mapping: address space structure to write - * @start: offset in bytes where the range starts - * @end: offset in bytes where the range ends (inclusive) - * @sync_mode: enable synchronous operation - * - * Start writeback against all of a mapping's dirty pages that lie - * within the byte offsets <start, end> inclusive. - * - * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as - * opposed to a regular memory cleansing writeback. The difference between - * these two operations is that if a dirty page/buffer is encountered, it must - * be waited upon, and not just skipped over. - */ -int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end, int sync_mode) -{ - struct writeback_control wbc = { - .sync_mode = sync_mode, - .nr_to_write = mapping->nrpages * 2, - .range_start = start, - .range_end = end, - }; - return btrfs_writepages(mapping, &wbc); -} - -/** - * taken from mm/filemap.c because it isn't exported - * - * wait_on_page_writeback_range - wait for writeback to complete - * @mapping: target address_space - * @start: beginning page index - * @end: ending page index - * - * Wait for writeback to complete against pages indexed by start->end - * inclusive - */ -int btrfs_wait_on_page_writeback_range(struct address_space *mapping, - pgoff_t start, pgoff_t end) -{ - struct pagevec pvec; - int nr_pages; - int ret = 0; - pgoff_t index; - - if (end < start) - return 0; - - pagevec_init(&pvec, 0); - index = start; - while ((index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { - unsigned i; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - /* until radix tree lookup accepts end_index */ - if (page->index > end) - continue; - - wait_on_page_writeback(page); - if (PageError(page)) - ret = -EIO; - } - pagevec_release(&pvec); - cond_resched(); - } - - /* Check for outstanding write errors */ - if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) - ret = -ENOSPC; - if (test_and_clear_bit(AS_EIO, &mapping->flags)) - ret = -EIO; - - return ret; -} - /* * add a given inode to the list of inodes that must be fully on * disk before a transaction commit finishes. diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 3d31c88..f82e874 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -85,6 +85,9 @@ struct btrfs_ordered_extent { /* extent length on disk */ u64 disk_len; + /* number of bytes that still need writing */ + u64 bytes_left; + /* flags (described above) */ unsigned long flags; @@ -150,10 +153,6 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); int btrfs_ordered_update_i_size(struct inode *inode, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -int btrfs_wait_on_page_writeback_range(struct address_space *mapping, - pgoff_t start, pgoff_t end); -int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, - loff_t end, int sync_mode); int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 3c0d52a..79cba5f 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -65,3 +65,23 @@ out: btrfs_free_path(path); return ret; } + +int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index c04f7f2..cfcc93c 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -121,6 +121,15 @@ struct inodevec { int nr; }; +#define MAX_EXTENTS 128 + +struct file_extent_cluster { + u64 start; + u64 end; + u64 boundary[MAX_EXTENTS]; + unsigned int nr; +}; + struct reloc_control { /* block group to relocate */ struct btrfs_block_group_cache *block_group; @@ -2180,7 +2189,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize, struct reloc_control *rc) { if (test_range_bit(&rc->processed_blocks, bytenr, - bytenr + blocksize - 1, EXTENT_DIRTY, 1)) + bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) return 1; return 0; } @@ -2529,56 +2538,94 @@ out: } static noinline_for_stack -int relocate_inode_pages(struct inode *inode, u64 start, u64 len) +int setup_extent_mapping(struct inode *inode, u64 start, u64 end, + u64 block_start) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret = 0; + + em = alloc_extent_map(GFP_NOFS); + if (!em) + return -ENOMEM; + + em->start = start; + em->len = end + 1 - start; + em->block_len = em->len; + em->block_start = block_start; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, start, end, 0); + } + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + return ret; +} + +static int relocate_file_extent_cluster(struct inode *inode, + struct file_extent_cluster *cluster) { u64 page_start; u64 page_end; - unsigned long i; - unsigned long first_index; + u64 offset = BTRFS_I(inode)->index_cnt; + unsigned long index; unsigned long last_index; - unsigned int total_read = 0; - unsigned int total_dirty = 0; + unsigned int dirty_page = 0; struct page *page; struct file_ra_state *ra; - struct btrfs_ordered_extent *ordered; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int nr = 0; int ret = 0; + if (!cluster->nr) + return 0; + ra = kzalloc(sizeof(*ra), GFP_NOFS); if (!ra) return -ENOMEM; + index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; + last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; + mutex_lock(&inode->i_mutex); - first_index = start >> PAGE_CACHE_SHIFT; - last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; - /* make sure the dirty trick played by the caller work */ - while (1) { - ret = invalidate_inode_pages2_range(inode->i_mapping, - first_index, last_index); - if (ret != -EBUSY) - break; - schedule_timeout(HZ/10); - } + i_size_write(inode, cluster->end + 1 - offset); + ret = setup_extent_mapping(inode, cluster->start - offset, + cluster->end - offset, cluster->start); if (ret) goto out_unlock; file_ra_state_init(ra, inode->i_mapping); - for (i = first_index ; i <= last_index; i++) { - if (total_read % ra->ra_pages == 0) { - btrfs_force_ra(inode->i_mapping, ra, NULL, i, - min(last_index, ra->ra_pages + i - 1)); - } - total_read++; -again: - if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) - BUG_ON(1); - page = grab_cache_page(inode->i_mapping, i); + WARN_ON(cluster->start != cluster->boundary[0]); + while (index <= last_index) { + page = find_lock_page(inode->i_mapping, index); if (!page) { - ret = -ENOMEM; - goto out_unlock; + page_cache_sync_readahead(inode->i_mapping, + ra, NULL, index, + last_index + 1 - index); + page = grab_cache_page(inode->i_mapping, index); + if (!page) { + ret = -ENOMEM; + goto out_unlock; + } + } + + if (PageReadahead(page)) { + page_cache_async_readahead(inode->i_mapping, + ra, NULL, page, index, + last_index + 1 - index); } + if (!PageUptodate(page)) { btrfs_readpage(NULL, page); lock_page(page); @@ -2589,75 +2636,79 @@ again: goto out_unlock; } } - wait_on_page_writeback(page); page_start = (u64)page->index << PAGE_CACHE_SHIFT; page_end = page_start + PAGE_CACHE_SIZE - 1; - lock_extent(io_tree, page_start, page_end, GFP_NOFS); - - ordered = btrfs_lookup_ordered_extent(inode, page_start); - if (ordered) { - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); - unlock_page(page); - page_cache_release(page); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - goto again; - } + + lock_extent(&BTRFS_I(inode)->io_tree, + page_start, page_end, GFP_NOFS); + set_page_extent_mapped(page); - if (i == first_index) - set_extent_bits(io_tree, page_start, page_end, + if (nr < cluster->nr && + page_start + offset == cluster->boundary[nr]) { + set_extent_bits(&BTRFS_I(inode)->io_tree, + page_start, page_end, EXTENT_BOUNDARY, GFP_NOFS); + nr++; + } btrfs_set_extent_delalloc(inode, page_start, page_end); set_page_dirty(page); - total_dirty++; + dirty_page++; - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, + page_start, page_end, GFP_NOFS); unlock_page(page); page_cache_release(page); + + index++; + if (nr < cluster->nr && + page_end + 1 + offset == cluster->boundary[nr]) { + balance_dirty_pages_ratelimited_nr(inode->i_mapping, + dirty_page); + dirty_page = 0; + } + } + if (dirty_page) { + balance_dirty_pages_ratelimited_nr(inode->i_mapping, + dirty_page); } + WARN_ON(nr != cluster->nr); out_unlock: mutex_unlock(&inode->i_mutex); kfree(ra); - balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); return ret; } static noinline_for_stack -int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key) +int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, + struct file_extent_cluster *cluster) { - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct extent_map *em; - u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt; - u64 end = start + extent_key->offset - 1; - - em = alloc_extent_map(GFP_NOFS); - em->start = start; - em->len = extent_key->offset; - em->block_len = extent_key->offset; - em->block_start = extent_key->objectid; - em->bdev = root->fs_info->fs_devices->latest_bdev; - set_bit(EXTENT_FLAG_PINNED, &em->flags); + int ret; - /* setup extent map to cheat btrfs_readpage */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); - while (1) { - int ret; - spin_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); - if (ret != -EEXIST) { - free_extent_map(em); - break; - } - btrfs_drop_extent_cache(inode, start, end, 0); + if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { + ret = relocate_file_extent_cluster(inode, cluster); + if (ret) + return ret; + cluster->nr = 0; } - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); - return relocate_inode_pages(inode, start, extent_key->offset); + if (!cluster->nr) + cluster->start = extent_key->objectid; + else + BUG_ON(cluster->nr >= MAX_EXTENTS); + cluster->end = extent_key->objectid + extent_key->offset - 1; + cluster->boundary[cluster->nr] = extent_key->objectid; + cluster->nr++; + + if (cluster->nr >= MAX_EXTENTS) { + ret = relocate_file_extent_cluster(inode, cluster); + if (ret) + return ret; + cluster->nr = 0; + } + return 0; } #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 @@ -3203,10 +3254,12 @@ static int check_extent_flags(u64 flags) return 0; } + static noinline_for_stack int relocate_block_group(struct reloc_control *rc) { struct rb_root blocks = RB_ROOT; struct btrfs_key key; + struct file_extent_cluster *cluster; struct btrfs_trans_handle *trans = NULL; struct btrfs_path *path; struct btrfs_extent_item *ei; @@ -3216,10 +3269,17 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) int ret; int err = 0; + cluster = kzalloc(sizeof(*cluster), GFP_NOFS); + if (!cluster) + return -ENOMEM; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; + rc->extents_found = 0; + rc->extents_skipped = 0; + rc->search_start = rc->block_group->key.objectid; clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, GFP_NOFS); @@ -3306,14 +3366,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } nr = trans->blocks_used; - btrfs_end_transaction_throttle(trans, rc->extent_root); + btrfs_end_transaction(trans, rc->extent_root); trans = NULL; btrfs_btree_balance_dirty(rc->extent_root, nr); if (rc->stage == MOVE_DATA_EXTENTS && (flags & BTRFS_EXTENT_FLAG_DATA)) { rc->found_file_extent = 1; - ret = relocate_data_extent(rc->data_inode, &key); + ret = relocate_data_extent(rc->data_inode, + &key, cluster); if (ret < 0) { err = ret; break; @@ -3328,6 +3389,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) btrfs_btree_balance_dirty(rc->extent_root, nr); } + if (!err) { + ret = relocate_file_extent_cluster(rc->data_inode, cluster); + if (ret < 0) + err = ret; + } + + kfree(cluster); + rc->create_reloc_root = 0; smp_mb(); @@ -3348,8 +3417,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } static int __insert_orphan_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 size) + struct btrfs_root *root, u64 objectid) { struct btrfs_path *path; struct btrfs_inode_item *item; @@ -3368,7 +3436,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); btrfs_set_inode_generation(leaf, item, 1); - btrfs_set_inode_size(leaf, item, size); + btrfs_set_inode_size(leaf, item, 0); btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); btrfs_mark_buffer_dirty(leaf); @@ -3404,12 +3472,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, if (err) goto out; - err = __insert_orphan_inode(trans, root, objectid, group->key.offset); - BUG_ON(err); - - err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, - group->key.offset, 0, group->key.offset, - 0, 0, 0); + err = __insert_orphan_inode(trans, root, objectid); BUG_ON(err); key.objectid = objectid; @@ -3455,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) BUG_ON(!rc->block_group); btrfs_init_workers(&rc->workers, "relocate", - fs_info->thread_pool_size); + fs_info->thread_pool_size, NULL); rc->extent_root = extent_root; btrfs_prepare_block_group_relocation(extent_root, rc->block_group); @@ -3475,14 +3538,15 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) btrfs_wait_ordered_extents(fs_info->tree_root, 0); while (1) { - mutex_lock(&fs_info->cleaner_mutex); - btrfs_clean_old_snapshots(fs_info->tree_root); - mutex_unlock(&fs_info->cleaner_mutex); - rc->extents_found = 0; rc->extents_skipped = 0; + mutex_lock(&fs_info->cleaner_mutex); + + btrfs_clean_old_snapshots(fs_info->tree_root); ret = relocate_block_group(rc); + + mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { err = ret; break; @@ -3514,10 +3578,10 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) } } - filemap_fdatawrite_range(fs_info->btree_inode->i_mapping, - rc->block_group->key.objectid, - rc->block_group->key.objectid + - rc->block_group->key.offset - 1); + filemap_write_and_wait_range(fs_info->btree_inode->i_mapping, + rc->block_group->key.objectid, + rc->block_group->key.objectid + + rc->block_group->key.offset - 1); WARN_ON(rc->block_group->pinned > 0); WARN_ON(rc->block_group->reserved > 0); @@ -3530,6 +3594,26 @@ out: return err; } +static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret; + + trans = btrfs_start_transaction(root->fs_info->tree_root, 1); + + memset(&root->root_item.drop_progress, 0, + sizeof(root->root_item.drop_progress)); + root->root_item.drop_level = 0; + btrfs_set_root_refs(&root->root_item, 0); + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &root->root_key, &root->root_item); + BUG_ON(ret); + + ret = btrfs_end_transaction(trans, root->fs_info->tree_root); + BUG_ON(ret); + return 0; +} + /* * recover relocation interrupted by system crash. * @@ -3589,8 +3673,12 @@ int btrfs_recover_relocation(struct btrfs_root *root) fs_root = read_fs_root(root->fs_info, reloc_root->root_key.offset); if (IS_ERR(fs_root)) { - err = PTR_ERR(fs_root); - goto out; + ret = PTR_ERR(fs_root); + if (ret != -ENOENT) { + err = ret; + goto out; + } + mark_garbage_root(reloc_root); } } @@ -3613,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) mapping_tree_init(&rc->reloc_root_tree); INIT_LIST_HEAD(&rc->reloc_roots); btrfs_init_workers(&rc->workers, "relocate", - root->fs_info->thread_pool_size); + root->fs_info->thread_pool_size, NULL); rc->extent_root = root->fs_info->extent_root; set_reloc_control(rc); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 0ddc6d6..9351428 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -94,17 +94,23 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, goto out; BUG_ON(ret == 0); + if (path->slots[0] == 0) { + ret = 1; + goto out; + } l = path->nodes[0]; - BUG_ON(path->slots[0] == 0); slot = path->slots[0] - 1; btrfs_item_key_to_cpu(l, &found_key, slot); - if (found_key.objectid != objectid) { + if (found_key.objectid != objectid || + found_key.type != BTRFS_ROOT_ITEM_KEY) { ret = 1; goto out; } - read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), - sizeof(*item)); - memcpy(key, &found_key, sizeof(found_key)); + if (item) + read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), + sizeof(*item)); + if (key) + memcpy(key, &found_key, sizeof(found_key)); ret = 0; out: btrfs_free_path(path); @@ -249,6 +255,59 @@ err: return ret; } +int btrfs_find_orphan_roots(struct btrfs_root *tree_root) +{ + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_key key; + int err = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = 0; + + while (1) { + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + if (ret < 0) { + err = ret; + break; + } + + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(tree_root, path); + if (ret < 0) + err = ret; + if (ret != 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(tree_root, path); + + if (key.objectid != BTRFS_ORPHAN_OBJECTID || + key.type != BTRFS_ORPHAN_ITEM_KEY) + break; + + ret = btrfs_find_dead_roots(tree_root, key.offset); + if (ret) { + err = ret; + break; + } + + key.offset++; + } + + btrfs_free_path(path); + return err; +} + /* drop the root item for 'key' from 'root' */ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key) @@ -278,31 +337,57 @@ out: return ret; } -#if 0 /* this will get used when snapshot deletion is implemented */ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root, - u64 root_id, u8 type, u64 ref_id) + u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, + const char *name, int name_len) + { + struct btrfs_path *path; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; struct btrfs_key key; + unsigned long ptr; + int err = 0; int ret; - struct btrfs_path *path; path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; key.objectid = root_id; - key.type = type; + key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = ref_id; - +again: ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); - BUG_ON(ret); - - ret = btrfs_del_item(trans, tree_root, path); - BUG_ON(ret); + BUG_ON(ret < 0); + if (ret == 0) { + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + + WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); + WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); + ptr = (unsigned long)(ref + 1); + WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); + *sequence = btrfs_root_ref_sequence(leaf, ref); + + ret = btrfs_del_item(trans, tree_root, path); + BUG_ON(ret); + } else + err = -ENOENT; + + if (key.type == BTRFS_ROOT_BACKREF_KEY) { + btrfs_release_path(tree_root, path); + key.objectid = ref_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = root_id; + goto again; + } btrfs_free_path(path); - return ret; + return err; } -#endif int btrfs_find_root_ref(struct btrfs_root *tree_root, struct btrfs_path *path, @@ -319,7 +404,6 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root, return ret; } - /* * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY * or BTRFS_ROOT_BACKREF_KEY. @@ -335,8 +419,7 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root, */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, struct btrfs_root *tree_root, - u64 root_id, u8 type, u64 ref_id, - u64 dirid, u64 sequence, + u64 root_id, u64 ref_id, u64 dirid, u64 sequence, const char *name, int name_len) { struct btrfs_key key; @@ -346,13 +429,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; unsigned long ptr; - path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; key.objectid = root_id; - key.type = type; + key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = ref_id; - +again: ret = btrfs_insert_empty_item(trans, tree_root, path, &key, sizeof(*ref) + name_len); BUG_ON(ret); @@ -366,6 +450,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, name, ptr, name_len); btrfs_mark_buffer_dirty(leaf); + if (key.type == BTRFS_ROOT_BACKREF_KEY) { + btrfs_release_path(tree_root, path); + key.objectid = ref_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = root_id; + goto again; + } + btrfs_free_path(path); - return ret; + return 0; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2db17cd..752a546 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -66,7 +66,8 @@ enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, - Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err, + Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, + Opt_discard, Opt_err, }; static match_table_t tokens = { @@ -88,6 +89,7 @@ static match_table_t tokens = { {Opt_notreelog, "notreelog"}, {Opt_flushoncommit, "flushoncommit"}, {Opt_ratio, "metadata_ratio=%d"}, + {Opt_discard, "discard"}, {Opt_err, NULL}, }; @@ -257,6 +259,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->metadata_ratio); } break; + case Opt_discard: + btrfs_set_opt(info->mount_opt, DISCARD); + break; default: break; } @@ -344,7 +349,9 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_export_op = &btrfs_export_ops; sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; +#ifdef CONFIG_BTRFS_FS_POSIX_ACL sb->s_flags |= MS_POSIXACL; +#endif tree_root = open_ctree(sb, fs_devices, (char *)data); @@ -676,6 +683,7 @@ static int btrfs_unfreeze(struct super_block *sb) } static const struct super_operations btrfs_super_ops = { + .drop_inode = btrfs_drop_inode, .delete_inode = btrfs_delete_inode, .put_super = btrfs_put_super, .sync_fs = btrfs_sync_fs, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index cdbb502..bca82a4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -104,7 +104,6 @@ static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, { if (root->ref_cows && root->last_trans < trans->transid) { WARN_ON(root == root->fs_info->extent_root); - WARN_ON(root->root_item.refs == 0); WARN_ON(root->commit_root != root->node); radix_tree_tag_set(&root->fs_info->fs_roots_radix, @@ -187,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->alloc_exclude_start = 0; h->delayed_ref_updates = 0; + if (!current->journal_info) + current->journal_info = h; + root->fs_info->running_transaction->use_count++; record_root_in_trans(h, root); mutex_unlock(&root->fs_info->trans_mutex); @@ -318,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, wake_up(&cur_trans->writer_wait); put_transaction(cur_trans); mutex_unlock(&info->trans_mutex); + + if (current->journal_info == trans) + current->journal_info = NULL; memset(trans, 0, sizeof(*trans)); kmem_cache_free(btrfs_trans_handle_cachep, trans); @@ -339,10 +344,10 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, /* * when btree blocks are allocated, they have some corresponding bits set for * them in one of two extent_io trees. This is used to make sure all of - * those extents are on disk for transaction or log commit + * those extents are sent to disk but does not wait on them */ -int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages) +int btrfs_write_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) { int ret; int err = 0; @@ -389,6 +394,29 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, page_cache_release(page); } } + if (err) + werr = err; + return werr; +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit. We wait + * on all the pages and clear them from the dirty pages state tree + */ +int btrfs_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) +{ + int ret; + int err = 0; + int werr = 0; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + u64 start = 0; + u64 end; + unsigned long index; + while (1) { ret = find_first_extent_bit(dirty_pages, 0, &start, &end, EXTENT_DIRTY); @@ -419,6 +447,22 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, return werr; } +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit + */ +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) +{ + int ret; + int ret2; + + ret = btrfs_write_marked_extents(root, dirty_pages); + ret2 = btrfs_wait_marked_extents(root, dirty_pages); + return ret || ret2; +} + int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -720,7 +764,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); key.objectid = objectid; - key.offset = 0; + /* record when the snapshot was created in key.offset */ + key.offset = trans->transid; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); old = btrfs_lock_root_node(root); @@ -743,6 +788,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, memcpy(&pending->root_key, &key, sizeof(key)); fail: kfree(new_root_item); + btrfs_unreserve_metadata_space(root, 6); return ret; } @@ -778,24 +824,14 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info, ret = btrfs_update_inode(trans, parent_root, parent_inode); BUG_ON(ret); - /* add the backref first */ ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, pending->root_key.objectid, - BTRFS_ROOT_BACKREF_KEY, parent_root->root_key.objectid, parent_inode->i_ino, index, pending->name, namelen); BUG_ON(ret); - /* now add the forward ref */ - ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, - parent_root->root_key.objectid, - BTRFS_ROOT_REF_KEY, - pending->root_key.objectid, - parent_inode->i_ino, index, pending->name, - namelen); - inode = btrfs_lookup_dentry(parent_inode, pending->dentry); d_instantiate(pending->dentry, inode); fail: @@ -874,7 +910,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, unsigned long timeout = 1; struct btrfs_transaction *cur_trans; struct btrfs_transaction *prev_trans = NULL; - struct extent_io_tree *pinned_copy; DEFINE_WAIT(wait); int ret; int should_grow = 0; @@ -915,13 +950,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return 0; } - pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS); - if (!pinned_copy) - return -ENOMEM; - - extent_io_tree_init(pinned_copy, - root->fs_info->btree_inode->i_mapping, GFP_NOFS); - trans->transaction->in_commit = 1; trans->transaction->blocked = 1; if (cur_trans->list.prev != &root->fs_info->trans_list) { @@ -1019,6 +1047,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = commit_cowonly_roots(trans, root); BUG_ON(ret); + btrfs_prepare_extent_commit(trans, root); + cur_trans = root->fs_info->running_transaction; spin_lock(&root->fs_info->new_trans_lock); root->fs_info->running_transaction = NULL; @@ -1042,8 +1072,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, sizeof(root->fs_info->super_copy)); - btrfs_copy_pinned(root, pinned_copy); - trans->transaction->blocked = 0; wake_up(&root->fs_info->transaction_wait); @@ -1059,8 +1087,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ mutex_unlock(&root->fs_info->tree_log_mutex); - btrfs_finish_extent_commit(trans, root, pinned_copy); - kfree(pinned_copy); + btrfs_finish_extent_commit(trans, root); /* do the directory inserts of any pending snapshot creations */ finish_pending_snapshots(trans, root->fs_info); @@ -1078,6 +1105,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_unlock(&root->fs_info->trans_mutex); + if (current->journal_info == trans) + current->journal_info = NULL; + kmem_cache_free(btrfs_trans_handle_cachep, trans); return ret; } @@ -1096,8 +1126,13 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) while (!list_empty(&list)) { root = list_entry(list.next, struct btrfs_root, root_list); - list_del_init(&root->root_list); - btrfs_drop_snapshot(root, 0); + list_del(&root->root_list); + + if (btrfs_header_backref_rev(root->node) < + BTRFS_MIXED_BACKREF_REV) + btrfs_drop_snapshot(root, 0); + else + btrfs_drop_snapshot(root, 1); } return 0; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 663c674..d4e3e7a 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -79,6 +79,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, struct inode *inode) { BTRFS_I(inode)->last_trans = trans->transaction->transid; + BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; } int btrfs_end_transaction(struct btrfs_trans_handle *trans, @@ -107,5 +108,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages); +int btrfs_write_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages); +int btrfs_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 30c0d45..741666a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -137,11 +137,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->log_mutex); if (root->log_root) { + if (!root->log_start_pid) { + root->log_start_pid = current->pid; + root->log_multiple_pids = false; + } else if (root->log_start_pid != current->pid) { + root->log_multiple_pids = true; + } + root->log_batch++; atomic_inc(&root->log_writers); mutex_unlock(&root->log_mutex); return 0; } + root->log_multiple_pids = false; + root->log_start_pid = current->pid; mutex_lock(&root->fs_info->tree_log_mutex); if (!root->fs_info->log_root_tree) { ret = btrfs_init_log_root_tree(trans, root->fs_info); @@ -263,8 +272,8 @@ static int process_one_buffer(struct btrfs_root *log, struct walk_control *wc, u64 gen) { if (wc->pin) - btrfs_update_pinned_extents(log->fs_info->extent_root, - eb->start, eb->len, 1); + btrfs_pin_extent(log->fs_info->extent_root, + eb->start, eb->len, 0); if (btrfs_buffer_uptodate(eb, gen)) { if (wc->write) @@ -534,7 +543,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ ret = btrfs_drop_extents(trans, root, inode, - start, extent_end, extent_end, start, &alloc_hint); + start, extent_end, extent_end, start, &alloc_hint, 1); BUG_ON(ret); if (found_type == BTRFS_FILE_EXTENT_REG || @@ -1971,6 +1980,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, int ret; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; + u64 log_transid = 0; mutex_lock(&root->log_mutex); index1 = root->log_transid % 2; @@ -1987,10 +1997,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, while (1) { unsigned long batch = root->log_batch; - mutex_unlock(&root->log_mutex); - schedule_timeout_uninterruptible(1); - mutex_lock(&root->log_mutex); - + if (root->log_multiple_pids) { + mutex_unlock(&root->log_mutex); + schedule_timeout_uninterruptible(1); + mutex_lock(&root->log_mutex); + } wait_for_writer(trans, root); if (batch == root->log_batch) break; @@ -2003,14 +2014,19 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } - ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); + /* we start IO on all the marked extents here, but we don't actually + * wait for them until later. + */ + ret = btrfs_write_marked_extents(log, &log->dirty_log_pages); BUG_ON(ret); btrfs_set_root_node(&log->root_item, log->node); root->log_batch = 0; + log_transid = root->log_transid; root->log_transid++; log->log_transid = root->log_transid; + root->log_start_pid = 0; smp_mb(); /* * log tree has been flushed to disk, new modifications of @@ -2036,6 +2052,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { + btrfs_wait_marked_extents(log, &log->dirty_log_pages); wait_log_commit(trans, log_root_tree, log_root_tree->log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2055,6 +2072,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * check the full commit flag again */ if (root->fs_info->last_trans_log_full_commit == trans->transid) { + btrfs_wait_marked_extents(log, &log->dirty_log_pages); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out_wake_log_root; @@ -2063,6 +2081,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_marked_extents(log_root_tree, &log_root_tree->dirty_log_pages); BUG_ON(ret); + btrfs_wait_marked_extents(log, &log->dirty_log_pages); btrfs_set_super_log_root(&root->fs_info->super_for_commit, log_root_tree->node->start); @@ -2082,9 +2101,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * the running transaction open, so a full commit can't hop * in and cause problems either. */ - write_ctree_super(trans, root->fs_info->tree_root, 2); + write_ctree_super(trans, root->fs_info->tree_root, 1); ret = 0; + mutex_lock(&root->log_mutex); + if (root->last_log_commit < log_transid) + root->last_log_commit = log_transid; + mutex_unlock(&root->log_mutex); + out_wake_log_root: atomic_set(&log_root_tree->log_commit[index2], 0); smp_mb(); @@ -2841,7 +2865,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) break; - if (parent == sb->s_root) + if (IS_ROOT(parent)) break; parent = parent->d_parent; @@ -2852,6 +2876,21 @@ out: return ret; } +static int inode_in_log(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + mutex_lock(&root->log_mutex); + if (BTRFS_I(inode)->logged_trans == trans->transid && + BTRFS_I(inode)->last_sub_trans <= root->last_log_commit) + ret = 1; + mutex_unlock(&root->log_mutex); + return ret; +} + + /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. A minimal inode and backref @@ -2880,11 +2919,22 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } + if (root != BTRFS_I(inode)->root || + btrfs_root_refs(&root->root_item) == 0) { + ret = 1; + goto end_no_trans; + } + ret = check_parent_dirs_for_sync(trans, inode, parent, sb, last_committed); if (ret) goto end_no_trans; + if (inode_in_log(trans, inode)) { + ret = BTRFS_NO_LOG_SYNC; + goto end_no_trans; + } + start_log_trans(trans, root); ret = btrfs_log_inode(trans, root, inode, inode_only); @@ -2907,12 +2957,15 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, break; inode = parent->d_inode; + if (root != BTRFS_I(inode)->root) + break; + if (BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { ret = btrfs_log_inode(trans, root, inode, inode_only); BUG_ON(ret); } - if (parent == sb->s_root) + if (IS_ROOT(parent)) break; parent = parent->d_parent; @@ -2951,7 +3004,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) struct btrfs_key tmp_key; struct btrfs_root *log; struct btrfs_fs_info *fs_info = log_root_tree->fs_info; - u64 highest_inode; struct walk_control wc = { .process_func = process_one_buffer, .stage = 0, @@ -3010,11 +3062,6 @@ again: path); BUG_ON(ret); } - ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode); - if (ret == 0) { - wc.replay_dest->highest_inode = highest_inode; - wc.replay_dest->last_inode_alloc = highest_inode; - } key.offset = found_key.offset - 1; wc.replay_dest->log_root = NULL; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index d09c760..0776eac 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -19,6 +19,9 @@ #ifndef __TREE_LOG_ #define __TREE_LOG_ +/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ +#define BTRFS_NO_LOG_SYNC 256 + int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5cf405b..7eda483 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -276,7 +276,7 @@ loop_lock: * is now congested. Back off and let other work structs * run instead */ - if (pending && bdi_write_congested(bdi) && batch_run > 32 && + if (pending && bdi_write_congested(bdi) && batch_run > 8 && fs_info->fs_devices->open_devices > 1) { struct io_context *ioc; @@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) goto error; device->name = kstrdup(orig_dev->name, GFP_NOFS); - if (!device->name) + if (!device->name) { + kfree(device); goto error; + } device->devid = orig_dev->devid; device->work.func = pending_bios_fn; @@ -719,10 +721,9 @@ error: * called very infrequently and that a given device has a small number * of extents */ -static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, - u64 num_bytes, u64 *start, - u64 *max_avail) +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *max_avail) { struct btrfs_key key; struct btrfs_root *root = device->dev_root; @@ -1736,6 +1737,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, extent_root = root->fs_info->extent_root; em_tree = &root->fs_info->mapping_tree.map_tree; + ret = btrfs_can_relocate(extent_root, chunk_offset); + if (ret) + return -ENOSPC; + /* step one, relocate all the extents inside this chunk */ ret = btrfs_relocate_block_group(extent_root, chunk_offset); BUG_ON(ret); @@ -1749,9 +1754,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, * step two, delete the device extents and the * chunk tree entries */ - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, chunk_offset, 1); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); BUG_ON(em->start > chunk_offset || em->start + em->len < chunk_offset); @@ -1780,9 +1785,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); BUG_ON(ret); - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); remove_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); kfree(map); em->bdev = NULL; @@ -1807,12 +1812,15 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) struct btrfs_key found_key; u64 chunk_tree = chunk_root->root_key.objectid; u64 chunk_type; + bool retried = false; + int failed = 0; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; +again: key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; @@ -1842,7 +1850,10 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) ret = btrfs_relocate_chunk(chunk_root, chunk_tree, found_key.objectid, found_key.offset); - BUG_ON(ret); + if (ret == -ENOSPC) + failed++; + else if (ret) + BUG(); } if (found_key.offset == 0) @@ -1850,6 +1861,14 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) key.offset = found_key.offset - 1; } ret = 0; + if (failed && !retried) { + failed = 0; + retried = true; + goto again; + } else if (failed && retried) { + WARN_ON(1); + ret = -ENOSPC; + } error: btrfs_free_path(path); return ret; @@ -1894,6 +1913,8 @@ int btrfs_balance(struct btrfs_root *dev_root) continue; ret = btrfs_shrink_device(device, old_size - size_to_free); + if (ret == -ENOSPC) + break; BUG_ON(ret); trans = btrfs_start_transaction(dev_root, 1); @@ -1938,9 +1959,8 @@ int btrfs_balance(struct btrfs_root *dev_root) chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_chunk); - key.offset = found_key.offset; /* chunk zero is special */ - if (key.offset == 0) + if (found_key.offset == 0) break; btrfs_release_path(chunk_root, path); @@ -1948,7 +1968,8 @@ int btrfs_balance(struct btrfs_root *dev_root) chunk_root->root_key.objectid, found_key.objectid, found_key.offset); - BUG_ON(ret); + BUG_ON(ret && ret != -ENOSPC); + key.offset = found_key.offset - 1; } ret = 0; error: @@ -1974,10 +1995,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) u64 chunk_offset; int ret; int slot; + int failed = 0; + bool retried = false; struct extent_buffer *l; struct btrfs_key key; struct btrfs_super_block *super_copy = &root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); + u64 old_size = device->total_bytes; u64 diff = device->total_bytes - new_size; if (new_size >= device->total_bytes) @@ -1987,12 +2011,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; - goto done; - } - path->reada = 2; lock_chunks(root); @@ -2001,8 +2019,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) if (device->writeable) device->fs_devices->total_rw_bytes -= diff; unlock_chunks(root); - btrfs_end_transaction(trans, root); +again: key.objectid = device->devid; key.offset = (u64)-1; key.type = BTRFS_DEV_EXTENT_KEY; @@ -2017,6 +2035,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) goto done; if (ret) { ret = 0; + btrfs_release_path(root, path); break; } @@ -2024,14 +2043,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) slot = path->slots[0]; btrfs_item_key_to_cpu(l, &key, path->slots[0]); - if (key.objectid != device->devid) + if (key.objectid != device->devid) { + btrfs_release_path(root, path); break; + } dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); length = btrfs_dev_extent_length(l, dev_extent); - if (key.offset + length <= new_size) + if (key.offset + length <= new_size) { + btrfs_release_path(root, path); break; + } chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); @@ -2040,8 +2063,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, chunk_offset); - if (ret) + if (ret && ret != -ENOSPC) goto done; + if (ret == -ENOSPC) + failed++; + key.offset -= 1; + } + + if (failed && !retried) { + failed = 0; + retried = true; + goto again; + } else if (failed && retried) { + ret = -ENOSPC; + lock_chunks(root); + + device->total_bytes = old_size; + if (device->writeable) + device->fs_devices->total_rw_bytes += diff; + unlock_chunks(root); + goto done; } /* Shrinking succeeded, else we would be at "done". */ @@ -2294,9 +2335,9 @@ again: em->block_len = em->len; em_tree = &extent_root->fs_info->mapping_tree.map_tree; - spin_lock(&em_tree->lock); + write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); - spin_unlock(&em_tree->lock); + write_unlock(&em_tree->lock); BUG_ON(ret); free_extent_map(em); @@ -2491,9 +2532,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) int readonly = 0; int i; - spin_lock(&map_tree->map_tree.lock); + read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); - spin_unlock(&map_tree->map_tree.lock); + read_unlock(&map_tree->map_tree.lock); if (!em) return 1; @@ -2518,11 +2559,11 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) struct extent_map *em; while (1) { - spin_lock(&tree->map_tree.lock); + write_lock(&tree->map_tree.lock); em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); if (em) remove_extent_mapping(&tree->map_tree, em); - spin_unlock(&tree->map_tree.lock); + write_unlock(&tree->map_tree.lock); if (!em) break; kfree(em->bdev); @@ -2540,9 +2581,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) struct extent_map_tree *em_tree = &map_tree->map_tree; int ret; - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, len); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); BUG_ON(!em); BUG_ON(em->start > logical || em->start + em->len < logical); @@ -2604,9 +2645,9 @@ again: atomic_set(&multi->error, 0); } - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, logical, *length); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); if (!em && unplug_page) return 0; @@ -2763,9 +2804,9 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 stripe_nr; int i, j, nr = 0; - spin_lock(&em_tree->lock); + read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, chunk_start, 1); - spin_unlock(&em_tree->lock); + read_unlock(&em_tree->lock); BUG_ON(!em || em->start != chunk_start); map = (struct map_lookup *)em->bdev; @@ -3053,9 +3094,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, logical = key->offset; length = btrfs_chunk_length(leaf, chunk); - spin_lock(&map_tree->map_tree.lock); + read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); - spin_unlock(&map_tree->map_tree.lock); + read_unlock(&map_tree->map_tree.lock); /* already mapped? */ if (em && em->start <= logical && em->start + em->len > logical) { @@ -3114,9 +3155,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, map->stripes[i].dev->in_fs_metadata = 1; } - spin_lock(&map_tree->map_tree.lock); + write_lock(&map_tree->map_tree.lock); ret = add_extent_mapping(&map_tree->map_tree, em); - spin_unlock(&map_tree->map_tree.lock); + write_unlock(&map_tree->map_tree.lock); BUG_ON(ret); free_extent_map(em); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 5139a83..31b0fab 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -181,4 +181,7 @@ int btrfs_balance(struct btrfs_root *dev_root); void btrfs_unlock_volumes(void); void btrfs_lock_volumes(void); int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); +int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *max_avail); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index a9d3bf4..b6dd596 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -260,7 +260,7 @@ err: * attributes are handled directly. */ struct xattr_handler *btrfs_xattr_handlers[] = { -#ifdef CONFIG_FS_POSIX_ACL +#ifdef CONFIG_BTRFS_FS_POSIX_ACL &btrfs_xattr_acl_access_handler, &btrfs_xattr_acl_default_handler, #endif |