diff options
author | Steve French <sfrench@us.ibm.com> | 2009-04-30 15:36:52 +0000 |
---|---|---|
committer | Steve French <sfrench@us.ibm.com> | 2009-04-30 15:36:52 +0000 |
commit | 912bc6ae3de99c7bada4577d4341ace4ee59b5be (patch) | |
tree | 28fd1a4bb9e4b05aa833285b46df169f12c0e24d /fs | |
parent | d37dc42ab6f040b8f0f2962ab219c5b2accf748d (diff) | |
parent | 091438dd5668396328a3419abcbc6591159eb8d1 (diff) | |
download | op-kernel-dev-912bc6ae3de99c7bada4577d4341ace4ee59b5be.zip op-kernel-dev-912bc6ae3de99c7bada4577d4341ace4ee59b5be.tar.gz |
Merge branch 'master' of /pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'fs')
66 files changed, 1047 insertions, 1009 deletions
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c index bf8c8af..4eb4d8d 100644 --- a/fs/autofs/dirhash.c +++ b/fs/autofs/dirhash.c @@ -39,10 +39,12 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb, { struct autofs_dirhash *dh = &sbi->dirhash; struct autofs_dir_ent *ent; - struct dentry *dentry; unsigned long timeout = sbi->exp_timeout; while (1) { + struct path path; + int umount_ok; + if ( list_empty(&dh->expiry_head) || sbi->catatonic ) return NULL; /* No entries */ /* We keep the list sorted by last_usage and want old stuff */ @@ -57,17 +59,17 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb, return ent; /* Symlinks are always expirable */ /* Get the dentry for the autofs subdirectory */ - dentry = ent->dentry; + path.dentry = ent->dentry; - if ( !dentry ) { + if (!path.dentry) { /* Should only happen in catatonic mode */ printk("autofs: dentry == NULL but inode range is directory, entry %s\n", ent->name); autofs_delete_usage(ent); continue; } - if ( !dentry->d_inode ) { - dput(dentry); + if (!path.dentry->d_inode) { + dput(path.dentry); printk("autofs: negative dentry on expiry queue: %s\n", ent->name); autofs_delete_usage(ent); @@ -76,29 +78,29 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb, /* Make sure entry is mounted and unused; note that dentry will point to the mounted-on-top root. */ - if (!S_ISDIR(dentry->d_inode->i_mode)||!d_mountpoint(dentry)) { + if (!S_ISDIR(path.dentry->d_inode->i_mode) || + !d_mountpoint(path.dentry)) { DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); continue; } - mntget(mnt); - dget(dentry); - if (!follow_down(&mnt, &dentry)) { - dput(dentry); - mntput(mnt); + path.mnt = mnt; + path_get(&path); + if (!follow_down(&path.mnt, &path.dentry)) { + path_put(&path); DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); continue; } - while (d_mountpoint(dentry) && follow_down(&mnt, &dentry)) + while (d_mountpoint(path.dentry) && + follow_down(&path.mnt, &path.dentry)) ; - dput(dentry); + umount_ok = may_umount(path.mnt); + path_put(&path); - if ( may_umount(mnt) ) { - mntput(mnt); + if (umount_ok) { DPRINTK(("autofs: signaling expire on %s\n", ent->name)); return ent; /* Expirable! */ } DPRINTK(("autofs: didn't expire due to may_umount: %s\n", ent->name)); - mntput(mnt); } return NULL; /* No expirable entries */ } diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 9e5ae8a..84168c0 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -54,11 +54,10 @@ static int check_name(const char *name) * Check a string doesn't overrun the chunk of * memory we copied from user land. */ -static int invalid_str(char *str, void *end) +static int invalid_str(char *str, size_t size) { - while ((void *) str <= end) - if (!*str++) - return 0; + if (memchr(str, 0, size)) + return 0; return -EINVAL; } @@ -138,8 +137,7 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) } if (param->size > sizeof(*param)) { - err = invalid_str(param->path, - (void *) ((size_t) param + param->size)); + err = invalid_str(param->path, param->size - sizeof(*param)); if (err) { AUTOFS_WARN( "path string terminator missing for cmd(0x%08x)", @@ -488,7 +486,7 @@ static int autofs_dev_ioctl_requester(struct file *fp, } path = param->path; - devid = sbi->sb->s_dev; + devid = new_encode_dev(sbi->sb->s_dev); param->requester.uid = param->requester.gid = -1; @@ -175,14 +175,6 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_vec *bvl; /* - * If 'bs' is given, lookup the pool and do the mempool alloc. - * If not, this is a bio_kmalloc() allocation and just do a - * kzalloc() for the exact number of vecs right away. - */ - if (!bs) - bvl = kmalloc(nr * sizeof(struct bio_vec), gfp_mask); - - /* * see comment near bvec_array define! */ switch (nr) { @@ -260,21 +252,6 @@ void bio_free(struct bio *bio, struct bio_set *bs) mempool_free(p, bs->bio_pool); } -/* - * default destructor for a bio allocated with bio_alloc_bioset() - */ -static void bio_fs_destructor(struct bio *bio) -{ - bio_free(bio, fs_bio_set); -} - -static void bio_kmalloc_destructor(struct bio *bio) -{ - if (bio_has_allocated_vec(bio)) - kfree(bio->bi_io_vec); - kfree(bio); -} - void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); @@ -301,21 +278,15 @@ void bio_init(struct bio *bio) **/ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { + unsigned long idx = BIO_POOL_NONE; struct bio_vec *bvl = NULL; - struct bio *bio = NULL; - unsigned long idx = 0; - void *p = NULL; - - if (bs) { - p = mempool_alloc(bs->bio_pool, gfp_mask); - if (!p) - goto err; - bio = p + bs->front_pad; - } else { - bio = kmalloc(sizeof(*bio), gfp_mask); - if (!bio) - goto err; - } + struct bio *bio; + void *p; + + p = mempool_alloc(bs->bio_pool, gfp_mask); + if (unlikely(!p)) + return NULL; + bio = p + bs->front_pad; bio_init(bio); @@ -332,22 +303,50 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) nr_iovecs = bvec_nr_vecs(idx); } +out_set: bio->bi_flags |= idx << BIO_POOL_OFFSET; bio->bi_max_vecs = nr_iovecs; -out_set: bio->bi_io_vec = bvl; - return bio; err_free: - if (bs) - mempool_free(p, bs->bio_pool); - else - kfree(bio); -err: + mempool_free(p, bs->bio_pool); return NULL; } +static void bio_fs_destructor(struct bio *bio) +{ + bio_free(bio, fs_bio_set); +} + +/** + * bio_alloc - allocate a new bio, memory pool backed + * @gfp_mask: allocation mask to use + * @nr_iovecs: number of iovecs + * + * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask + * contains __GFP_WAIT, the allocation is guaranteed to succeed. + * + * RETURNS: + * Pointer to new bio on success, NULL on failure. + */ +struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) +{ + struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); + + if (bio) + bio->bi_destructor = bio_fs_destructor; + + return bio; +} + +static void bio_kmalloc_destructor(struct bio *bio) +{ + if (bio_integrity(bio)) + bio_integrity_free(bio); + kfree(bio); +} + /** * bio_alloc - allocate a bio for I/O * @gfp_mask: the GFP_ mask given to the slab allocator @@ -366,29 +365,20 @@ err: * do so can cause livelocks under memory pressure. * **/ -struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) -{ - struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); - - if (bio) - bio->bi_destructor = bio_fs_destructor; - - return bio; -} - -/* - * Like bio_alloc(), but doesn't use a mempool backing. This means that - * it CAN fail, but while bio_alloc() can only be used for allocations - * that have a short (finite) life span, bio_kmalloc() should be used - * for more permanent bio allocations (like allocating some bio's for - * initalization or setup purposes). - */ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) { - struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); + struct bio *bio; - if (bio) - bio->bi_destructor = bio_kmalloc_destructor; + bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), + gfp_mask); + if (unlikely(!bio)) + return NULL; + + bio_init(bio); + bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; + bio->bi_max_vecs = nr_iovecs; + bio->bi_io_vec = bio->bi_inline_vecs; + bio->bi_destructor = bio_kmalloc_destructor; return bio; } @@ -827,12 +817,15 @@ struct bio *bio_copy_user_iov(struct request_queue *q, len += iov[i].iov_len; } + if (offset) + nr_pages++; + bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask); if (!bmd) return ERR_PTR(-ENOMEM); ret = -ENOMEM; - bio = bio_alloc(gfp_mask, nr_pages); + bio = bio_kmalloc(gfp_mask, nr_pages); if (!bio) goto out_bmd; @@ -956,7 +949,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, if (!nr_pages) return ERR_PTR(-EINVAL); - bio = bio_alloc(gfp_mask, nr_pages); + bio = bio_kmalloc(gfp_mask, nr_pages); if (!bio) return ERR_PTR(-ENOMEM); @@ -1140,7 +1133,7 @@ static struct bio *__bio_map_kern(struct request_queue *q, void *data, int offset, i; struct bio *bio; - bio = bio_alloc(gfp_mask, nr_pages); + bio = bio_kmalloc(gfp_mask, nr_pages); if (!bio) return ERR_PTR(-ENOMEM); diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9adf5e4..9421284 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,25 +1,10 @@ -ifneq ($(KERNELRELEASE),) -# kbuild part of makefile obj-$(CONFIG_BTRFS_FS) := btrfs.o -btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + +btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ compression.o delayed-ref.o -else - -# Normal Makefile - -KERNELDIR := /lib/modules/`uname -r`/build -all: - $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules - -modules_install: - $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install -clean: - $(MAKE) -C $(KERNELDIR) M=`pwd` clean - -endif diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 7fdd184..cbba000 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -60,15 +60,20 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) return ERR_PTR(-EINVAL); } + /* Handle the cached NULL acl case without locking */ + acl = ACCESS_ONCE(*p_acl); + if (!acl) + return acl; + spin_lock(&inode->i_lock); - if (*p_acl != BTRFS_ACL_NOT_CACHED) - acl = posix_acl_dup(*p_acl); + acl = *p_acl; + if (acl != BTRFS_ACL_NOT_CACHED) + acl = posix_acl_dup(acl); spin_unlock(&inode->i_lock); - if (acl) + if (acl != BTRFS_ACL_NOT_CACHED) return acl; - size = __btrfs_getxattr(inode, name, "", 0); if (size > 0) { value = kzalloc(size, GFP_NOFS); @@ -80,9 +85,12 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) btrfs_update_cached_acl(inode, p_acl, acl); } kfree(value); - } else if (size == -ENOENT) { + } else if (size == -ENOENT || size == -ENODATA || size == 0) { + /* FIXME, who returns -ENOENT? I think nobody */ acl = NULL; btrfs_update_cached_acl(inode, p_acl, acl); + } else { + acl = ERR_PTR(-EIO); } return acl; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 51bfdfc..502c3d6 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -25,6 +25,7 @@ #define WORK_QUEUED_BIT 0 #define WORK_DONE_BIT 1 #define WORK_ORDER_DONE_BIT 2 +#define WORK_HIGH_PRIO_BIT 3 /* * container for the kthread task pointer and the list of pending work @@ -36,6 +37,7 @@ struct btrfs_worker_thread { /* list of struct btrfs_work that are waiting for service */ struct list_head pending; + struct list_head prio_pending; /* list of worker threads from struct btrfs_workers */ struct list_head worker_list; @@ -103,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers, spin_lock_irqsave(&workers->lock, flags); - while (!list_empty(&workers->order_list)) { - work = list_entry(workers->order_list.next, - struct btrfs_work, order_list); - + while (1) { + if (!list_empty(&workers->prio_order_list)) { + work = list_entry(workers->prio_order_list.next, + struct btrfs_work, order_list); + } else if (!list_empty(&workers->order_list)) { + work = list_entry(workers->order_list.next, + struct btrfs_work, order_list); + } else { + break; + } if (!test_bit(WORK_DONE_BIT, &work->flags)) break; @@ -143,8 +151,14 @@ static int worker_loop(void *arg) do { spin_lock_irq(&worker->lock); again_locked: - while (!list_empty(&worker->pending)) { - cur = worker->pending.next; + while (1) { + if (!list_empty(&worker->prio_pending)) + cur = worker->prio_pending.next; + else if (!list_empty(&worker->pending)) + cur = worker->pending.next; + else + break; + work = list_entry(cur, struct btrfs_work, list); list_del(&work->list); clear_bit(WORK_QUEUED_BIT, &work->flags); @@ -163,7 +177,6 @@ again_locked: spin_lock_irq(&worker->lock); check_idle_worker(worker); - } if (freezing(current)) { worker->working = 0; @@ -178,7 +191,8 @@ again_locked: * jump_in? */ smp_mb(); - if (!list_empty(&worker->pending)) + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) continue; /* @@ -191,7 +205,8 @@ again_locked: */ schedule_timeout(1); smp_mb(); - if (!list_empty(&worker->pending)) + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) continue; if (kthread_should_stop()) @@ -200,7 +215,8 @@ again_locked: /* still no more work?, sleep for real */ spin_lock_irq(&worker->lock); set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&worker->pending)) + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) goto again_locked; /* @@ -248,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) INIT_LIST_HEAD(&workers->worker_list); INIT_LIST_HEAD(&workers->idle_list); INIT_LIST_HEAD(&workers->order_list); + INIT_LIST_HEAD(&workers->prio_order_list); spin_lock_init(&workers->lock); workers->max_workers = max; workers->idle_thresh = 32; @@ -273,6 +290,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) } INIT_LIST_HEAD(&worker->pending); + INIT_LIST_HEAD(&worker->prio_pending); INIT_LIST_HEAD(&worker->worker_list); spin_lock_init(&worker->lock); atomic_set(&worker->num_pending, 0); @@ -396,7 +414,10 @@ int btrfs_requeue_work(struct btrfs_work *work) goto out; spin_lock_irqsave(&worker->lock, flags); - list_add_tail(&work->list, &worker->pending); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) + list_add_tail(&work->list, &worker->prio_pending); + else + list_add_tail(&work->list, &worker->pending); atomic_inc(&worker->num_pending); /* by definition we're busy, take ourselves off the idle @@ -422,6 +443,11 @@ out: return 0; } +void btrfs_set_work_high_prio(struct btrfs_work *work) +{ + set_bit(WORK_HIGH_PRIO_BIT, &work->flags); +} + /* * places a struct btrfs_work into the pending queue of one of the kthreads */ @@ -438,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) worker = find_worker(workers); if (workers->ordered) { spin_lock_irqsave(&workers->lock, flags); - list_add_tail(&work->order_list, &workers->order_list); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { + list_add_tail(&work->order_list, + &workers->prio_order_list); + } else { + list_add_tail(&work->order_list, &workers->order_list); + } spin_unlock_irqrestore(&workers->lock, flags); } else { INIT_LIST_HEAD(&work->order_list); @@ -446,7 +477,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) spin_lock_irqsave(&worker->lock, flags); - list_add_tail(&work->list, &worker->pending); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) + list_add_tail(&work->list, &worker->prio_pending); + else + list_add_tail(&work->list, &worker->pending); atomic_inc(&worker->num_pending); check_busy_worker(worker); diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 31be4ed..1b511c1 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -85,6 +85,7 @@ struct btrfs_workers { * of work items waiting for completion */ struct list_head order_list; + struct list_head prio_order_list; /* lock for finding the next worker thread to queue on */ spinlock_t lock; @@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); int btrfs_stop_workers(struct btrfs_workers *workers); void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); int btrfs_requeue_work(struct btrfs_work *work); +void btrfs_set_work_high_prio(struct btrfs_work *work); #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e5b2533..a99f1c2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1325,12 +1325,12 @@ static noinline int reada_for_balance(struct btrfs_root *root, int ret = 0; int blocksize; - parent = path->nodes[level - 1]; + parent = path->nodes[level + 1]; if (!parent) return 0; nritems = btrfs_header_nritems(parent); - slot = path->slots[level]; + slot = path->slots[level + 1]; blocksize = btrfs_level_size(root, level); if (slot > 0) { @@ -1341,7 +1341,7 @@ static noinline int reada_for_balance(struct btrfs_root *root, block1 = 0; free_extent_buffer(eb); } - if (slot < nritems) { + if (slot + 1 < nritems) { block2 = btrfs_node_blockptr(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1); eb = btrfs_find_tree_block(root, block2, blocksize); @@ -1351,7 +1351,11 @@ static noinline int reada_for_balance(struct btrfs_root *root, } if (block1 || block2) { ret = -EAGAIN; + + /* release the whole path */ btrfs_release_path(root, path); + + /* read the blocks */ if (block1) readahead_tree_block(root, block1, blocksize, 0); if (block2) @@ -1361,7 +1365,7 @@ static noinline int reada_for_balance(struct btrfs_root *root, eb = read_tree_block(root, block1, blocksize, 0); free_extent_buffer(eb); } - if (block1) { + if (block2) { eb = read_tree_block(root, block2, blocksize, 0); free_extent_buffer(eb); } @@ -1481,12 +1485,15 @@ read_block_for_search(struct btrfs_trans_handle *trans, * of the btree by dropping locks before * we read. */ - btrfs_release_path(NULL, p); + btrfs_unlock_up_safe(p, level + 1); + btrfs_set_path_blocking(p); + if (tmp) free_extent_buffer(tmp); if (p->reada) reada_for_search(root, p, level, slot, key->objectid); + btrfs_release_path(NULL, p); tmp = read_tree_block(root, blocknr, blocksize, gen); if (tmp) free_extent_buffer(tmp); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ad96495..4414a5d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -881,6 +881,9 @@ struct btrfs_fs_info { u64 metadata_alloc_profile; u64 system_alloc_profile; + unsigned data_chunk_allocations; + unsigned metadata_ratio; + void *bdev_holder; }; @@ -2174,7 +2177,8 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode); extern struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 inline_limit, u64 *hint_block); + u64 start, u64 end, u64 locked_end, + u64 inline_limit, u64 *hint_block); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 92caa80..0ff16d3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -232,10 +232,14 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - printk(KERN_INFO "btrfs: %s checksum verify failed " - "on %llu wanted %X found %X level %d\n", - root->fs_info->sb->s_id, - buf->start, val, found, btrfs_header_level(buf)); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs: %s checksum verify " + "failed on %llu wanted %X found %X " + "level %d\n", + root->fs_info->sb->s_id, + (unsigned long long)buf->start, val, found, + btrfs_header_level(buf)); + } if (result != (char *)&inline_result) kfree(result); return 1; @@ -268,10 +272,13 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 0; goto out; } - printk("parent transid verify failed on %llu wanted %llu found %llu\n", - (unsigned long long)eb->start, - (unsigned long long)parent_transid, - (unsigned long long)btrfs_header_generation(eb)); + if (printk_ratelimit()) { + printk("parent transid verify failed on %llu wanted %llu " + "found %llu\n", + (unsigned long long)eb->start, + (unsigned long long)parent_transid, + (unsigned long long)btrfs_header_generation(eb)); + } ret = 1; clear_extent_buffer_uptodate(io_tree, eb); out: @@ -415,9 +422,12 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, found_start = btrfs_header_bytenr(eb); if (found_start != start) { - printk(KERN_INFO "btrfs bad tree block start %llu %llu\n", - (unsigned long long)found_start, - (unsigned long long)eb->start); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs bad tree block start " + "%llu %llu\n", + (unsigned long long)found_start, + (unsigned long long)eb->start); + } ret = -EIO; goto err; } @@ -429,8 +439,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, goto err; } if (check_tree_block_fsid(root, eb)) { - printk(KERN_INFO "btrfs bad fsid on block %llu\n", - (unsigned long long)eb->start); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs bad fsid on block %llu\n", + (unsigned long long)eb->start); + } ret = -EIO; goto err; } @@ -579,19 +591,12 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->bio_flags = bio_flags; atomic_inc(&fs_info->nr_async_submits); + + if (rw & (1 << BIO_RW_SYNCIO)) + btrfs_set_work_high_prio(&async->work); + btrfs_queue_worker(&fs_info->workers, &async->work); -#if 0 - int limit = btrfs_async_submit_limit(fs_info); - if (atomic_read(&fs_info->nr_async_submits) > limit) { - wait_event_timeout(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) < limit), - HZ/10); - wait_event_timeout(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_bios) < limit), - HZ/10); - } -#endif while (atomic_read(&fs_info->async_submit_draining) && atomic_read(&fs_info->nr_async_submits)) { wait_event(fs_info->async_submit_wait, @@ -656,6 +661,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 0); } + /* * kthread helpers are used to submit writes so that checksumming * can happen in parallel across all CPUs @@ -765,27 +771,6 @@ static void btree_invalidatepage(struct page *page, unsigned long offset) } } -#if 0 -static int btree_writepage(struct page *page, struct writeback_control *wbc) -{ - struct buffer_head *bh; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct buffer_head *head; - if (!page_has_buffers(page)) { - create_empty_buffers(page, root->fs_info->sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); - } - head = page_buffers(page); - bh = head; - do { - if (buffer_dirty(bh)) - csum_tree_block(root, bh, 0); - bh = bh->b_this_page; - } while (bh != head); - return block_write_full_page(page, btree_get_block, wbc); -} -#endif - static struct address_space_operations btree_aops = { .readpage = btree_readpage, .writepage = btree_writepage, @@ -1273,11 +1258,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) int ret = 0; struct btrfs_device *device; struct backing_dev_info *bdi; -#if 0 - if ((bdi_bits & (1 << BDI_write_congested)) && - btrfs_congested_async(info, 0)) - return 1; -#endif + list_for_each_entry(device, &info->fs_devices->devices, dev_list) { if (!device->bdev) continue; @@ -1599,6 +1580,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->btree_inode = new_inode(sb); fs_info->btree_inode->i_ino = 1; fs_info->btree_inode->i_nlink = 1; + fs_info->metadata_ratio = 8; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); @@ -1689,7 +1671,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (features) { printk(KERN_ERR "BTRFS: couldn't mount because of " "unsupported optional features (%Lx).\n", - features); + (unsigned long long)features); err = -EINVAL; goto fail_iput; } @@ -1699,7 +1681,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!(sb->s_flags & MS_RDONLY) && features) { printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " "unsupported option features (%Lx).\n", - features); + (unsigned long long)features); err = -EINVAL; goto fail_iput; } @@ -2095,10 +2077,10 @@ static int write_dev_supers(struct btrfs_device *device, device->barriers = 0; get_bh(bh); lock_buffer(bh); - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); } } else { - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); } if (!ret && wait) { @@ -2291,7 +2273,7 @@ int close_ctree(struct btrfs_root *root) if (fs_info->delalloc_bytes) { printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", - fs_info->delalloc_bytes); + (unsigned long long)fs_info->delalloc_bytes); } if (fs_info->total_ref_cache_size) { printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", @@ -2328,16 +2310,6 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); -#if 0 - while (!list_empty(&fs_info->hashers)) { - struct btrfs_hasher *hasher; - hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher, - hashers); - list_del(&hasher->hashers); - crypto_free_hash(&fs_info->hash_tfm); - kfree(hasher); - } -#endif btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 178df4c..e496644 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1844,10 +1844,14 @@ again: printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" ", %llu bytes_used, %llu bytes_reserved, " "%llu bytes_pinned, %llu bytes_readonly, %llu may use" - "%llu total\n", bytes, data_sinfo->bytes_delalloc, - data_sinfo->bytes_used, data_sinfo->bytes_reserved, - data_sinfo->bytes_pinned, data_sinfo->bytes_readonly, - data_sinfo->bytes_may_use, data_sinfo->total_bytes); + "%llu total\n", (unsigned long long)bytes, + (unsigned long long)data_sinfo->bytes_delalloc, + (unsigned long long)data_sinfo->bytes_used, + (unsigned long long)data_sinfo->bytes_reserved, + (unsigned long long)data_sinfo->bytes_pinned, + (unsigned long long)data_sinfo->bytes_readonly, + (unsigned long long)data_sinfo->bytes_may_use, + (unsigned long long)data_sinfo->total_bytes); return -ENOSPC; } data_sinfo->bytes_may_use += bytes; @@ -1918,15 +1922,29 @@ void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, spin_unlock(&info->lock); } +static void force_metadata_allocation(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + found->force_alloc = 1; + } + rcu_read_unlock(); +} + static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force) { struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = extent_root->fs_info; u64 thresh; int ret = 0; - mutex_lock(&extent_root->fs_info->chunk_mutex); + mutex_lock(&fs_info->chunk_mutex); flags = btrfs_reduce_alloc_profile(extent_root, flags); @@ -1958,6 +1976,18 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, } spin_unlock(&space_info->lock); + /* + * if we're doing a data chunk, go ahead and make sure that + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. + */ + if (flags & BTRFS_BLOCK_GROUP_DATA) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) + force_metadata_allocation(fs_info); + } + ret = btrfs_alloc_chunk(trans, extent_root, flags); if (ret) space_info->full = 1; @@ -2798,9 +2828,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes) info->bytes_pinned - info->bytes_reserved), (info->full) ? "" : "not "); printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," - " may_use=%llu, used=%llu\n", info->total_bytes, - info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use, - info->bytes_used); + " may_use=%llu, used=%llu\n", + (unsigned long long)info->total_bytes, + (unsigned long long)info->bytes_pinned, + (unsigned long long)info->bytes_delalloc, + (unsigned long long)info->bytes_may_use, + (unsigned long long)info->bytes_used); down_read(&info->groups_sem); list_for_each_entry(cache, &info->block_groups, list) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index eb2bee8..fe9eb99 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -17,12 +17,6 @@ #include "ctree.h" #include "btrfs_inode.h" -/* temporary define until extent_map moves out of btrfs */ -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *, struct kmem_cache *, - unsigned long)); - static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -50,20 +44,23 @@ struct extent_page_data { /* tells writepage not to lock the state bits for this range * it still does the unlocking */ - int extent_locked; + unsigned int extent_locked:1; + + /* tells the submit_bio code to use a WRITE_SYNC */ + unsigned int sync_io:1; }; int __init extent_io_init(void) { - extent_state_cache = btrfs_cache_create("extent_state", - sizeof(struct extent_state), 0, - NULL); + extent_state_cache = kmem_cache_create("extent_state", + sizeof(struct extent_state), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; - extent_buffer_cache = btrfs_cache_create("extent_buffers", - sizeof(struct extent_buffer), 0, - NULL); + extent_buffer_cache = kmem_cache_create("extent_buffers", + sizeof(struct extent_buffer), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) goto free_state_cache; return 0; @@ -1404,69 +1401,6 @@ out: return total_bytes; } -#if 0 -/* - * helper function to lock both pages and extents in the tree. - * pages must be locked first. - */ -static int lock_range(struct extent_io_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - int err; - - while (index <= end_index) { - page = grab_cache_page(tree->mapping, index); - if (!page) { - err = -ENOMEM; - goto failed; - } - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto failed; - } - index++; - } - lock_extent(tree, start, end, GFP_NOFS); - return 0; - -failed: - /* - * we failed above in getting the page at 'index', so we undo here - * up to but not including the page at 'index' - */ - end_index = index; - index = start >> PAGE_CACHE_SHIFT; - while (index < end_index) { - page = find_get_page(tree->mapping, index); - unlock_page(page); - page_cache_release(page); - index++; - } - return err; -} - -/* - * helper function to unlock both pages and extents in the tree. - */ -static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(tree->mapping, index); - unlock_page(page); - page_cache_release(page); - index++; - } - unlock_extent(tree, start, end, GFP_NOFS); - return 0; -} -#endif - /* * set the private field for a given byte offset in the tree. If there isn't * an extent_state there already, this does nothing. @@ -2101,6 +2035,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, return ret; } +static noinline void update_nr_written(struct page *page, + struct writeback_control *wbc, + unsigned long nr_written) +{ + wbc->nr_to_write -= nr_written; + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && + wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) + page->mapping->writeback_index = page->index + nr_written; +} + /* * the writepage semantics are similar to regular writepage. extent * records are inserted to lock ranges in the tree, and as dirty areas @@ -2136,8 +2080,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, u64 delalloc_end; int page_started; int compressed; + int write_flags; unsigned long nr_written = 0; + if (wbc->sync_mode == WB_SYNC_ALL) + write_flags = WRITE_SYNC_PLUG; + else + write_flags = WRITE; + WARN_ON(!PageLocked(page)); pg_offset = i_size & (PAGE_CACHE_SIZE - 1); if (page->index > end_index || @@ -2164,6 +2114,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, delalloc_end = 0; page_started = 0; if (!epd->extent_locked) { + /* + * make sure the wbc mapping index is at least updated + * to this page. + */ + update_nr_written(page, wbc, 0); + while (delalloc_end < page_end) { nr_delalloc = find_lock_delalloc_range(inode, tree, page, @@ -2185,7 +2141,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, */ if (page_started) { ret = 0; - goto update_nr_written; + /* + * we've unlocked the page, so we can't update + * the mapping's writeback index, just update + * nr_to_write. + */ + wbc->nr_to_write -= nr_written; + goto done_unlocked; } } lock_extent(tree, start, page_end, GFP_NOFS); @@ -2198,13 +2160,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (ret == -EAGAIN) { unlock_extent(tree, start, page_end, GFP_NOFS); redirty_page_for_writepage(wbc, page); + update_nr_written(page, wbc, nr_written); unlock_page(page); ret = 0; - goto update_nr_written; + goto done_unlocked; } } - nr_written++; + /* + * we don't want to touch the inode after unlocking the page, + * so we update the mapping writeback index now + */ + update_nr_written(page, wbc, nr_written + 1); end = page_end; if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) @@ -2314,9 +2281,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, (unsigned long long)end); } - ret = submit_extent_page(WRITE, tree, page, sector, - iosize, pg_offset, bdev, - &epd->bio, max_nr, + ret = submit_extent_page(write_flags, tree, page, + sector, iosize, pg_offset, + bdev, &epd->bio, max_nr, end_bio_extent_writepage, 0, 0, 0); if (ret) @@ -2336,11 +2303,8 @@ done: unlock_extent(tree, unlock_start, page_end, GFP_NOFS); unlock_page(page); -update_nr_written: - wbc->nr_to_write -= nr_written; - if (wbc->range_cyclic || (wbc->nr_to_write > 0 && - wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) - page->mapping->writeback_index = page->index + nr_written; +done_unlocked: + return 0; } @@ -2460,15 +2424,23 @@ retry: return ret; } -static noinline void flush_write_bio(void *data) +static void flush_epd_write_bio(struct extent_page_data *epd) { - struct extent_page_data *epd = data; if (epd->bio) { - submit_one_bio(WRITE, epd->bio, 0, 0); + if (epd->sync_io) + submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); + else + submit_one_bio(WRITE, epd->bio, 0, 0); epd->bio = NULL; } } +static noinline void flush_write_bio(void *data) +{ + struct extent_page_data *epd = data; + flush_epd_write_bio(epd); +} + int extent_write_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, struct writeback_control *wbc) @@ -2480,23 +2452,22 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, .tree = tree, .get_extent = get_extent, .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; struct writeback_control wbc_writepages = { .bdi = wbc->bdi, - .sync_mode = WB_SYNC_NONE, + .sync_mode = wbc->sync_mode, .older_than_this = NULL, .nr_to_write = 64, .range_start = page_offset(page) + PAGE_CACHE_SIZE, .range_end = (loff_t)-1, }; - ret = __extent_writepage(page, wbc, &epd); extent_write_cache_pages(tree, mapping, &wbc_writepages, __extent_writepage, &epd, flush_write_bio); - if (epd.bio) - submit_one_bio(WRITE, epd.bio, 0, 0); + flush_epd_write_bio(&epd); return ret; } @@ -2515,6 +2486,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, .tree = tree, .get_extent = get_extent, .extent_locked = 1, + .sync_io = mode == WB_SYNC_ALL, }; struct writeback_control wbc_writepages = { .bdi = inode->i_mapping->backing_dev_info, @@ -2540,8 +2512,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, start += PAGE_CACHE_SIZE; } - if (epd.bio) - submit_one_bio(WRITE, epd.bio, 0, 0); + flush_epd_write_bio(&epd); return ret; } @@ -2556,13 +2527,13 @@ int extent_writepages(struct extent_io_tree *tree, .tree = tree, .get_extent = get_extent, .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; ret = extent_write_cache_pages(tree, mapping, wbc, __extent_writepage, &epd, flush_write_bio); - if (epd.bio) - submit_one_bio(WRITE, epd.bio, 0, 0); + flush_epd_write_bio(&epd); return ret; } diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b187917..30c9365 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -6,19 +6,14 @@ #include <linux/hardirq.h> #include "extent_map.h" -/* temporary define until extent_map moves out of btrfs */ -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *, struct kmem_cache *, - unsigned long)); static struct kmem_cache *extent_map_cache; int __init extent_map_init(void) { - extent_map_cache = btrfs_cache_create("extent_map", - sizeof(struct extent_map), 0, - NULL); + extent_map_cache = kmem_cache_create("extent_map", + sizeof(struct extent_map), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_map_cache) return -ENOMEM; return 0; @@ -43,7 +38,6 @@ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) tree->map.rb_node = NULL; spin_lock_init(&tree->lock); } -EXPORT_SYMBOL(extent_map_tree_init); /** * alloc_extent_map - allocate new extent map structure @@ -64,7 +58,6 @@ struct extent_map *alloc_extent_map(gfp_t mask) atomic_set(&em->refs, 1); return em; } -EXPORT_SYMBOL(alloc_extent_map); /** * free_extent_map - drop reference count of an extent_map @@ -83,7 +76,6 @@ void free_extent_map(struct extent_map *em) kmem_cache_free(extent_map_cache, em); } } -EXPORT_SYMBOL(free_extent_map); static struct rb_node *tree_insert(struct rb_root *root, u64 offset, struct rb_node *node) @@ -264,7 +256,6 @@ int add_extent_mapping(struct extent_map_tree *tree, out: return ret; } -EXPORT_SYMBOL(add_extent_mapping); /* simple helper to do math around the end of an extent, handling wrap */ static u64 range_end(u64 start, u64 len) @@ -326,7 +317,6 @@ found: out: return em; } -EXPORT_SYMBOL(lookup_extent_mapping); /** * remove_extent_mapping - removes an extent_map from the extent tree @@ -346,4 +336,3 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) em->in_tree = 0; return ret; } -EXPORT_SYMBOL(remove_extent_mapping); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9c9fb46..1d51dc3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -272,83 +272,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, return 0; } -int btrfs_check_file(struct btrfs_root *root, struct inode *inode) -{ - return 0; -#if 0 - struct btrfs_path *path; - struct btrfs_key found_key; - struct extent_buffer *leaf; - struct btrfs_file_extent_item *extent; - u64 last_offset = 0; - int nritems; - int slot; - int found_type; - int ret; - int err = 0; - u64 extent_end = 0; - - path = btrfs_alloc_path(); - ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino, - last_offset, 0); - while (1) { - nritems = btrfs_header_nritems(path->nodes[0]); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret) - goto out; - nritems = btrfs_header_nritems(path->nodes[0]); - } - slot = path->slots[0]; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, slot); - if (found_key.objectid != inode->i_ino) - break; - if (found_key.type != BTRFS_EXTENT_DATA_KEY) - goto out; - - if (found_key.offset < last_offset) { - WARN_ON(1); - btrfs_print_leaf(root, leaf); - printk(KERN_ERR "inode %lu found offset %llu " - "expected %llu\n", inode->i_ino, - (unsigned long long)found_key.offset, - (unsigned long long)last_offset); - err = 1; - goto out; - } - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(leaf, extent); - if (found_type == BTRFS_FILE_EXTENT_REG) { - extent_end = found_key.offset + - btrfs_file_extent_num_bytes(leaf, extent); - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - struct btrfs_item *item; - item = btrfs_item_nr(leaf, slot); - extent_end = found_key.offset + - btrfs_file_extent_inline_len(leaf, extent); - extent_end = (extent_end + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); - } - last_offset = extent_end; - path->slots[0]++; - } - if (0 && last_offset < inode->i_size) { - WARN_ON(1); - btrfs_print_leaf(root, leaf); - printk(KERN_ERR "inode %lu found offset %llu size %llu\n", - inode->i_ino, (unsigned long long)last_offset, - (unsigned long long)inode->i_size); - err = 1; - - } -out: - btrfs_free_path(path); - return err; -#endif -} - /* * this is very complex, but the basic idea is to drop all extents * in the range start - end. hint_block is filled in with a block number @@ -363,15 +286,16 @@ out: */ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 inline_limit, u64 *hint_byte) + u64 start, u64 end, u64 locked_end, + u64 inline_limit, u64 *hint_byte) { u64 extent_end = 0; - u64 locked_end = end; u64 search_start = start; u64 leaf_start; u64 ram_bytes = 0; u64 orig_parent = 0; u64 disk_bytenr = 0; + u64 orig_locked_end = locked_end; u8 compression; u8 encryption; u16 other_encoding = 0; @@ -684,11 +608,10 @@ next_slot: } out: btrfs_free_path(path); - if (locked_end > end) { - unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, - GFP_NOFS); + if (locked_end > orig_locked_end) { + unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end, + locked_end - 1, GFP_NOFS); } - btrfs_check_file(root, inode); return ret; } @@ -830,7 +753,7 @@ again: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); BUG_ON(ret); - goto done; + goto release; } else if (split == start) { if (locked_end < extent_end) { ret = try_lock_extent(&BTRFS_I(inode)->io_tree, @@ -926,6 +849,8 @@ again: } done: btrfs_mark_buffer_dirty(leaf); + +release: btrfs_release_path(root, path); if (split_end && split == start) { split = end; @@ -1131,7 +1056,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, if (will_write) { btrfs_fdatawrite_range(inode->i_mapping, pos, pos + write_bytes - 1, - WB_SYNC_NONE); + WB_SYNC_ALL); } else { balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 768b952..0bc9365 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -332,13 +332,17 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, printk(KERN_ERR "couldn't find space %llu to free\n", (unsigned long long)offset); printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n", - block_group->cached, block_group->key.objectid, - block_group->key.offset); + block_group->cached, + (unsigned long long)block_group->key.objectid, + (unsigned long long)block_group->key.offset); btrfs_dump_free_space(block_group, bytes); } else if (info) { printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, " "but wanted offset=%llu bytes=%llu\n", - info->offset, info->bytes, offset, bytes); + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (unsigned long long)offset, + (unsigned long long)bytes); } WARN_ON(1); } @@ -357,8 +361,9 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes) count++; - printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset, - info->bytes); + printk(KERN_ERR "entry offset %llu, bytes %llu\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes); } printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" "\n", count); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index cc7334d..9abbced 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -79,7 +79,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, } path = btrfs_alloc_path(); BUG_ON(!path); - search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID); + search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID); search_key.objectid = search_start; search_key.type = 0; search_key.offset = 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a0d1dd4..90c23eb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -70,7 +70,6 @@ static struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; -struct kmem_cache *btrfs_bit_radix_cachep; struct kmem_cache *btrfs_path_cachep; #define S_SHIFT 12 @@ -234,7 +233,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, } ret = btrfs_drop_extents(trans, root, inode, start, - aligned_end, start, &hint_byte); + aligned_end, aligned_end, start, &hint_byte); BUG_ON(ret); if (isize > actual_end) @@ -1439,6 +1438,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct inode *inode, u64 file_pos, u64 disk_bytenr, u64 disk_num_bytes, u64 num_bytes, u64 ram_bytes, + u64 locked_end, u8 compression, u8 encryption, u16 other_encoding, int extent_type) { @@ -1455,7 +1455,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; ret = btrfs_drop_extents(trans, root, inode, file_pos, - file_pos + num_bytes, file_pos, &hint); + file_pos + num_bytes, locked_end, + file_pos, &hint); BUG_ON(ret); ins.objectid = inode->i_ino; @@ -1590,6 +1591,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->disk_len, ordered_extent->len, ordered_extent->len, + ordered_extent->file_offset + + ordered_extent->len, compressed, 0, 0, BTRFS_FILE_EXTENT_REG); BUG_ON(ret); @@ -1819,10 +1822,12 @@ good: return 0; zeroit: - printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " - "private %llu\n", page->mapping->host->i_ino, - (unsigned long long)start, csum, - (unsigned long long)private); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " + "private %llu\n", page->mapping->host->i_ino, + (unsigned long long)start, csum, + (unsigned long long)private); + } memset(kaddr + offset, 1, end - start + 1); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); @@ -2011,6 +2016,57 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) } /* + * very simple check to peek ahead in the leaf looking for xattrs. If we + * don't find any xattrs, we know there can't be any acls. + * + * slot is the slot the inode is in, objectid is the objectid of the inode + */ +static noinline int acls_after_inode_item(struct extent_buffer *leaf, + int slot, u64 objectid) +{ + u32 nritems = btrfs_header_nritems(leaf); + struct btrfs_key found_key; + int scanned = 0; + + slot++; + while (slot < nritems) { + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* we found a different objectid, there must not be acls */ + if (found_key.objectid != objectid) + return 0; + + /* we found an xattr, assume we've got an acl */ + if (found_key.type == BTRFS_XATTR_ITEM_KEY) + return 1; + + /* + * we found a key greater than an xattr key, there can't + * be any acls later on + */ + if (found_key.type > BTRFS_XATTR_ITEM_KEY) + return 0; + + slot++; + scanned++; + + /* + * it goes inode, inode backrefs, xattrs, extents, + * so if there are a ton of hard links to an inode there can + * be a lot of backrefs. Don't waste time searching too hard, + * this is just an optimization + */ + if (scanned >= 8) + break; + } + /* we hit the end of the leaf before we found an xattr or + * something larger than an xattr. We have to assume the inode + * has acls + */ + return 1; +} + +/* * read an inode from the btree into the in-memory inode */ void btrfs_read_locked_inode(struct inode *inode) @@ -2021,6 +2077,7 @@ void btrfs_read_locked_inode(struct inode *inode) struct btrfs_timespec *tspec; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; + int maybe_acls; u64 alloc_group_block; u32 rdev; int ret; @@ -2067,6 +2124,16 @@ void btrfs_read_locked_inode(struct inode *inode) alloc_group_block = btrfs_inode_block_group(leaf, inode_item); + /* + * try to precache a NULL acl entry for files that don't have + * any xattrs or acls + */ + maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); + if (!maybe_acls) { + BTRFS_I(inode)->i_acl = NULL; + BTRFS_I(inode)->i_default_acl = NULL; + } + BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, alloc_group_block, 0); btrfs_free_path(path); @@ -2877,6 +2944,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) err = btrfs_drop_extents(trans, root, inode, cur_offset, cur_offset + hole_size, + block_end, cur_offset, &hint_byte); if (err) break; @@ -3041,8 +3109,8 @@ static noinline void init_btrfs_i(struct inode *inode) { struct btrfs_inode *bi = BTRFS_I(inode); - bi->i_acl = NULL; - bi->i_default_acl = NULL; + bi->i_acl = BTRFS_ACL_NOT_CACHED; + bi->i_default_acl = BTRFS_ACL_NOT_CACHED; bi->generation = 0; bi->sequence = 0; @@ -4634,47 +4702,36 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_trans_handle_cachep); if (btrfs_transaction_cachep) kmem_cache_destroy(btrfs_transaction_cachep); - if (btrfs_bit_radix_cachep) - kmem_cache_destroy(btrfs_bit_radix_cachep); if (btrfs_path_cachep) kmem_cache_destroy(btrfs_path_cachep); } -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *)) -{ - return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD | extra_flags), ctor); -} - int btrfs_init_cachep(void) { - btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache", - sizeof(struct btrfs_inode), - 0, init_once); + btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", + sizeof(struct btrfs_inode), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); if (!btrfs_inode_cachep) goto fail; - btrfs_trans_handle_cachep = - btrfs_cache_create("btrfs_trans_handle_cache", - sizeof(struct btrfs_trans_handle), - 0, NULL); + + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", + sizeof(struct btrfs_trans_handle), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_trans_handle_cachep) goto fail; - btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache", - sizeof(struct btrfs_transaction), - 0, NULL); + + btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", + sizeof(struct btrfs_transaction), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_transaction_cachep) goto fail; - btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache", - sizeof(struct btrfs_path), - 0, NULL); + + btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", + sizeof(struct btrfs_path), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_path_cachep) goto fail; - btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256, - SLAB_DESTROY_BY_RCU, NULL); - if (!btrfs_bit_radix_cachep) - goto fail; + return 0; fail: btrfs_destroy_cachep(); @@ -4970,10 +5027,10 @@ out_fail: return err; } -static int prealloc_file_range(struct inode *inode, u64 start, u64 end, - u64 alloc_hint, int mode) +static int prealloc_file_range(struct btrfs_trans_handle *trans, + struct inode *inode, u64 start, u64 end, + u64 locked_end, u64 alloc_hint, int mode) { - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 alloc_size; @@ -4981,10 +5038,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, u64 num_bytes = end - start; int ret = 0; - trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); - btrfs_set_trans_block_group(trans, inode); - while (num_bytes > 0) { alloc_size = min(num_bytes, root->fs_info->max_extent); ret = btrfs_reserve_extent(trans, root, alloc_size, @@ -4997,7 +5050,8 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, ret = insert_reserved_file_extent(trans, inode, cur_offset, ins.objectid, ins.offset, ins.offset, - ins.offset, 0, 0, 0, + ins.offset, locked_end, + 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); BUG_ON(ret); num_bytes -= ins.offset; @@ -5015,7 +5069,6 @@ out: BUG_ON(ret); } - btrfs_end_transaction(trans, root); return ret; } @@ -5027,13 +5080,21 @@ static long btrfs_fallocate(struct inode *inode, int mode, u64 alloc_start; u64 alloc_end; u64 alloc_hint = 0; + u64 locked_end; u64 mask = BTRFS_I(inode)->root->sectorsize - 1; struct extent_map *em; + struct btrfs_trans_handle *trans; int ret; alloc_start = offset & ~mask; alloc_end = (offset + len + mask) & ~mask; + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. + */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + mutex_lock(&inode->i_mutex); if (alloc_start > inode->i_size) { ret = btrfs_cont_expand(inode, alloc_start); @@ -5041,10 +5102,21 @@ static long btrfs_fallocate(struct inode *inode, int mode, goto out; } + locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; - lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, - alloc_end - 1, GFP_NOFS); + + trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1); + if (!trans) { + ret = -EIO; + goto out; + } + + /* the extent lock is ordered inside the running + * transaction + */ + lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + GFP_NOFS); ordered = btrfs_lookup_first_ordered_extent(inode, alloc_end - 1); if (ordered && @@ -5052,7 +5124,13 @@ static long btrfs_fallocate(struct inode *inode, int mode, ordered->file_offset < alloc_end) { btrfs_put_ordered_extent(ordered); unlock_extent(&BTRFS_I(inode)->io_tree, - alloc_start, alloc_end - 1, GFP_NOFS); + alloc_start, locked_end, GFP_NOFS); + btrfs_end_transaction(trans, BTRFS_I(inode)->root); + + /* + * we can't wait on the range with the transaction + * running or with the extent lock held + */ btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); } else { @@ -5070,8 +5148,9 @@ static long btrfs_fallocate(struct inode *inode, int mode, last_byte = min(extent_map_end(em), alloc_end); last_byte = (last_byte + mask) & ~mask; if (em->block_start == EXTENT_MAP_HOLE) { - ret = prealloc_file_range(inode, cur_offset, - last_byte, alloc_hint, mode); + ret = prealloc_file_range(trans, inode, cur_offset, + last_byte, locked_end + 1, + alloc_hint, mode); if (ret < 0) { free_extent_map(em); break; @@ -5087,8 +5166,10 @@ static long btrfs_fallocate(struct inode *inode, int mode, break; } } - unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, + unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, GFP_NOFS); + + btrfs_end_transaction(trans, BTRFS_I(inode)->root); out: mutex_unlock(&inode->i_mutex); return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7594bec..5e94ea6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -461,15 +461,9 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); - - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; namelen = strlen(vol_args->name); @@ -483,11 +477,13 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) *devstr = '\0'; devstr = vol_args->name; devid = simple_strtoull(devstr, &end, 10); - printk(KERN_INFO "resizing devid %llu\n", devid); + printk(KERN_INFO "resizing devid %llu\n", + (unsigned long long)devid); } device = btrfs_find_device(root, devid, NULL, NULL); if (!device) { - printk(KERN_INFO "resizer unable to find device %llu\n", devid); + printk(KERN_INFO "resizer unable to find device %llu\n", + (unsigned long long)devid); ret = -EINVAL; goto out_unlock; } @@ -545,7 +541,6 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) out_unlock: mutex_unlock(&root->fs_info->volume_mutex); -out: kfree(vol_args); return ret; } @@ -565,15 +560,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); - - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; namelen = strlen(vol_args->name); @@ -675,19 +664,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_init_new_device(root, vol_args->name); -out: kfree(vol_args); return ret; } @@ -703,19 +686,13 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_rm_device(root, vol_args->name); -out: kfree(vol_args); return ret; } @@ -830,7 +807,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, BUG_ON(!trans); /* punch hole in destination first */ - btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte); + btrfs_drop_extents(trans, root, inode, off, off + len, + off + len, 0, &hint_byte); /* clone data */ key.objectid = src->i_ino; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 53c87b1..d6f0806 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -489,7 +489,7 @@ again: /* start IO across the range first to instantiate any delalloc * extents */ - btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE); + btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); /* The compression code will leave pages locked but return from * writepage without setting the page writeback. Starting again diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9744af9..3536bdb 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -68,7 +68,7 @@ enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog, - Opt_flushoncommit, Opt_err, + Opt_ratio, Opt_flushoncommit, Opt_err, }; static match_table_t tokens = { @@ -87,6 +87,7 @@ static match_table_t tokens = { {Opt_noacl, "noacl"}, {Opt_notreelog, "notreelog"}, {Opt_flushoncommit, "flushoncommit"}, + {Opt_ratio, "metadata_ratio=%d"}, {Opt_err, NULL}, }; @@ -195,7 +196,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->max_extent = max_t(u64, info->max_extent, root->sectorsize); printk(KERN_INFO "btrfs: max_extent at %llu\n", - info->max_extent); + (unsigned long long)info->max_extent); } break; case Opt_max_inline: @@ -210,7 +211,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) root->sectorsize); } printk(KERN_INFO "btrfs: max_inline at %llu\n", - info->max_inline); + (unsigned long long)info->max_inline); } break; case Opt_alloc_start: @@ -220,7 +221,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) kfree(num); printk(KERN_INFO "btrfs: allocations start at %llu\n", - info->alloc_start); + (unsigned long long)info->alloc_start); } break; case Opt_noacl: @@ -234,6 +235,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); break; + case Opt_ratio: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->metadata_ratio = intarg; + printk(KERN_INFO "btrfs: metadata ratio %d\n", + info->metadata_ratio); + } + break; default: break; } @@ -410,11 +420,14 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) if (btrfs_test_opt(root, NOBARRIER)) seq_puts(seq, ",nobarrier"); if (info->max_extent != (u64)-1) - seq_printf(seq, ",max_extent=%llu", info->max_extent); + seq_printf(seq, ",max_extent=%llu", + (unsigned long long)info->max_extent); if (info->max_inline != 8192 * 1024) - seq_printf(seq, ",max_inline=%llu", info->max_inline); + seq_printf(seq, ",max_inline=%llu", + (unsigned long long)info->max_inline); if (info->alloc_start != 0) - seq_printf(seq, ",alloc_start=%llu", info->alloc_start); + seq_printf(seq, ",alloc_start=%llu", + (unsigned long long)info->alloc_start); if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); @@ -635,14 +648,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - vol = kmalloc(sizeof(*vol), GFP_KERNEL); - if (!vol) - return -ENOMEM; - - if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { - ret = -EFAULT; - goto out; - } + vol = memdup_user((void __user *)arg, sizeof(*vol)); + if (IS_ERR(vol)) + return PTR_ERR(vol); switch (cmd) { case BTRFS_IOC_SCAN_DEV: @@ -650,7 +658,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, &btrfs_fs_type, &fs_devices); break; } -out: + kfree(vol); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2869b33..01b1436 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -687,7 +687,13 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) prepare_to_wait(&info->transaction_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&info->trans_mutex); + + atomic_dec(&info->throttles); + wake_up(&info->transaction_throttle); + schedule(); + + atomic_inc(&info->throttles); mutex_lock(&info->trans_mutex); finish_wait(&info->transaction_wait, &wait); } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 25f20ea..db5e212 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -536,7 +536,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ ret = btrfs_drop_extents(trans, root, inode, - start, extent_end, start, &alloc_hint); + start, extent_end, extent_end, start, &alloc_hint); BUG_ON(ret); if (found_type == BTRFS_FILE_EXTENT_REG || diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e0913e4..5f01dad 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -125,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) return NULL; } +static void requeue_list(struct btrfs_pending_bios *pending_bios, + struct bio *head, struct bio *tail) +{ + + struct bio *old_head; + + old_head = pending_bios->head; + pending_bios->head = head; + if (pending_bios->tail) + tail->bi_next = old_head; + else + pending_bios->tail = tail; +} + /* * we try to collect pending bios for a device so we don't get a large * number of procs sending bios down to the same device. This greatly @@ -141,10 +155,12 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) struct bio *pending; struct backing_dev_info *bdi; struct btrfs_fs_info *fs_info; + struct btrfs_pending_bios *pending_bios; struct bio *tail; struct bio *cur; int again = 0; - unsigned long num_run = 0; + unsigned long num_run; + unsigned long num_sync_run; unsigned long limit; unsigned long last_waited = 0; @@ -153,20 +169,30 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; + /* we want to make sure that every time we switch from the sync + * list to the normal list, we unplug + */ + num_sync_run = 0; + loop: spin_lock(&device->io_lock); + num_run = 0; loop_lock: + /* take all the bios off the list at once and process them * later on (without the lock held). But, remember the * tail and other pointers so the bios can be properly reinserted * into the list if we hit congestion */ - pending = device->pending_bios; - tail = device->pending_bio_tail; + if (device->pending_sync_bios.head) + pending_bios = &device->pending_sync_bios; + else + pending_bios = &device->pending_bios; + + pending = pending_bios->head; + tail = pending_bios->tail; WARN_ON(pending && !tail); - device->pending_bios = NULL; - device->pending_bio_tail = NULL; /* * if pending was null this time around, no bios need processing @@ -176,16 +202,41 @@ loop_lock: * device->running_pending is used to synchronize with the * schedule_bio code. */ - if (pending) { - again = 1; - device->running_pending = 1; - } else { + if (device->pending_sync_bios.head == NULL && + device->pending_bios.head == NULL) { again = 0; device->running_pending = 0; + } else { + again = 1; + device->running_pending = 1; } + + pending_bios->head = NULL; + pending_bios->tail = NULL; + spin_unlock(&device->io_lock); + /* + * if we're doing the regular priority list, make sure we unplug + * for any high prio bios we've sent down + */ + if (pending_bios == &device->pending_bios && num_sync_run > 0) { + num_sync_run = 0; + blk_run_backing_dev(bdi, NULL); + } + while (pending) { + + rmb(); + if (pending_bios != &device->pending_sync_bios && + device->pending_sync_bios.head && + num_run > 16) { + cond_resched(); + spin_lock(&device->io_lock); + requeue_list(pending_bios, pending, tail); + goto loop_lock; + } + cur = pending; pending = pending->bi_next; cur->bi_next = NULL; @@ -196,10 +247,18 @@ loop_lock: wake_up(&fs_info->async_submit_wait); BUG_ON(atomic_read(&cur->bi_cnt) == 0); - bio_get(cur); submit_bio(cur->bi_rw, cur); - bio_put(cur); num_run++; + if (bio_sync(cur)) + num_sync_run++; + + if (need_resched()) { + if (num_sync_run) { + blk_run_backing_dev(bdi, NULL); + num_sync_run = 0; + } + cond_resched(); + } /* * we made progress, there is more work to do and the bdi @@ -208,7 +267,6 @@ loop_lock: */ if (pending && bdi_write_congested(bdi) && num_run > 16 && fs_info->fs_devices->open_devices > 1) { - struct bio *old_head; struct io_context *ioc; ioc = current->io_context; @@ -233,17 +291,17 @@ loop_lock: * against it before looping */ last_waited = ioc->last_waited; + if (need_resched()) { + if (num_sync_run) { + blk_run_backing_dev(bdi, NULL); + num_sync_run = 0; + } + cond_resched(); + } continue; } spin_lock(&device->io_lock); - - old_head = device->pending_bios; - device->pending_bios = pending; - if (device->pending_bio_tail) - tail->bi_next = old_head; - else - device->pending_bio_tail = tail; - + requeue_list(pending_bios, pending, tail); device->running_pending = 1; spin_unlock(&device->io_lock); @@ -251,11 +309,18 @@ loop_lock: goto done; } } + + if (num_sync_run) { + num_sync_run = 0; + blk_run_backing_dev(bdi, NULL); + } + + cond_resched(); if (again) goto loop; spin_lock(&device->io_lock); - if (device->pending_bios) + if (device->pending_bios.head || device->pending_sync_bios.head) goto loop_lock; spin_unlock(&device->io_lock); @@ -1478,7 +1543,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); - btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); btrfs_mark_buffer_dirty(leaf); @@ -1875,14 +1940,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) device->total_bytes = new_size; if (device->writeable) device->fs_devices->total_rw_bytes -= diff; - ret = btrfs_update_device(trans, device); - if (ret) { - unlock_chunks(root); - btrfs_end_transaction(trans, root); - goto done; - } - WARN_ON(diff > old_total); - btrfs_set_super_total_bytes(super_copy, old_total - diff); unlock_chunks(root); btrfs_end_transaction(trans, root); @@ -1914,7 +1971,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) length = btrfs_dev_extent_length(l, dev_extent); if (key.offset + length <= new_size) - goto done; + break; chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); @@ -1927,6 +1984,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) goto done; } + /* Shrinking succeeded, else we would be at "done". */ + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto done; + } + lock_chunks(root); + + device->disk_total_bytes = new_size; + /* Now btrfs_update_device() will change the on-disk size. */ + ret = btrfs_update_device(trans, device); + if (ret) { + unlock_chunks(root); + btrfs_end_transaction(trans, root); + goto done; + } + WARN_ON(diff > old_total); + btrfs_set_super_total_bytes(super_copy, old_total - diff); + unlock_chunks(root); + btrfs_end_transaction(trans, root); done: btrfs_free_path(path); return ret; @@ -2497,7 +2574,7 @@ again: max_errors = 1; } } - if (multi_ret && rw == WRITE && + if (multi_ret && (rw & (1 << BIO_RW)) && stripes_allocated < stripes_required) { stripes_allocated = map->num_stripes; free_extent_map(em); @@ -2762,6 +2839,7 @@ static noinline int schedule_bio(struct btrfs_root *root, int rw, struct bio *bio) { int should_queue = 1; + struct btrfs_pending_bios *pending_bios; /* don't bother with additional async steps for reads, right now */ if (!(rw & (1 << BIO_RW))) { @@ -2783,13 +2861,17 @@ static noinline int schedule_bio(struct btrfs_root *root, bio->bi_rw |= rw; spin_lock(&device->io_lock); + if (bio_sync(bio)) + pending_bios = &device->pending_sync_bios; + else + pending_bios = &device->pending_bios; - if (device->pending_bio_tail) - device->pending_bio_tail->bi_next = bio; + if (pending_bios->tail) + pending_bios->tail->bi_next = bio; - device->pending_bio_tail = bio; - if (!device->pending_bios) - device->pending_bios = bio; + pending_bios->tail = bio; + if (!pending_bios->head) + pending_bios->head = bio; if (device->running_pending) should_queue = 0; @@ -3006,7 +3088,8 @@ static int fill_device_from_item(struct extent_buffer *leaf, unsigned long ptr; device->devid = btrfs_device_id(leaf, dev_item); - device->total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->total_bytes = device->disk_total_bytes; device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); device->type = btrfs_device_type(leaf, dev_item); device->io_align = btrfs_device_io_align(leaf, dev_item); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2185de7..5c3ff6d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -23,13 +23,22 @@ #include "async-thread.h" struct buffer_head; +struct btrfs_pending_bios { + struct bio *head; + struct bio *tail; +}; + struct btrfs_device { struct list_head dev_list; struct list_head dev_alloc_list; struct btrfs_fs_devices *fs_devices; struct btrfs_root *dev_root; - struct bio *pending_bios; - struct bio *pending_bio_tail; + + /* regular prio bios */ + struct btrfs_pending_bios pending_bios; + /* WRITE_SYNC bios */ + struct btrfs_pending_bios pending_sync_bios; + int running_pending; u64 generation; @@ -52,6 +61,9 @@ struct btrfs_device { /* size of the device */ u64 total_bytes; + /* size of the disk */ + u64 disk_total_bytes; + /* bytes used */ u64 bytes_used; diff --git a/fs/compat.c b/fs/compat.c index 3f84d5f..681ed81 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -181,22 +181,24 @@ asmlinkage long compat_sys_newstat(char __user * filename, struct compat_stat __user *statbuf) { struct kstat stat; - int error = vfs_stat_fd(AT_FDCWD, filename, &stat); + int error; - if (!error) - error = cp_compat_stat(&stat, statbuf); - return error; + error = vfs_stat(filename, &stat); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); } asmlinkage long compat_sys_newlstat(char __user * filename, struct compat_stat __user *statbuf) { struct kstat stat; - int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); + int error; - if (!error) - error = cp_compat_stat(&stat, statbuf); - return error; + error = vfs_lstat(filename, &stat); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); } #ifndef __ARCH_WANT_STAT64 @@ -204,21 +206,12 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename, struct compat_stat __user *statbuf, int flag) { struct kstat stat; - int error = -EINVAL; - - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) - goto out; - - if (flag & AT_SYMLINK_NOFOLLOW) - error = vfs_lstat_fd(dfd, filename, &stat); - else - error = vfs_stat_fd(dfd, filename, &stat); - - if (!error) - error = cp_compat_stat(&stat, statbuf); + int error; -out: - return error; + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); } #endif @@ -1483,6 +1476,7 @@ int compat_do_execve(char * filename, struct linux_binprm *bprm; struct file *file; struct files_struct *displaced; + bool clear_in_exec; int retval; retval = unshare_files(&displaced); @@ -1505,8 +1499,9 @@ int compat_do_execve(char * filename, goto out_unlock; retval = check_unsafe_exec(bprm); - if (retval) + if (retval < 0) goto out_unlock; + clear_in_exec = retval; file = open_exec(filename); retval = PTR_ERR(file); @@ -1553,9 +1548,7 @@ int compat_do_execve(char * filename, goto out; /* execve succeeded */ - write_lock(¤t->fs->lock); current->fs->in_exec = 0; - write_unlock(¤t->fs->lock); current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); acct_update_integrals(current); @@ -1575,9 +1568,8 @@ out_file: } out_unmark: - write_lock(¤t->fs->lock); - current->fs->in_exec = 0; - write_unlock(¤t->fs->lock); + if (clear_in_exec) + current->fs->in_exec = 0; out_unlock: current->in_execve = 0; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 3e87ce4..b83f6bc 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -58,7 +58,6 @@ #include <linux/i2c.h> #include <linux/i2c-dev.h> #include <linux/atalk.h> -#include <linux/loop.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci.h> @@ -68,6 +67,7 @@ #include <linux/gigaset_dev.h> #ifdef CONFIG_BLOCK +#include <linux/loop.h> #include <scsi/scsi.h> #include <scsi/scsi_ioctl.h> #include <scsi/sg.h> @@ -2660,6 +2660,8 @@ HANDLE_IOCTL(SONET_GETFRAMING, do_atm_ioctl) HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl) /* block stuff */ #ifdef CONFIG_BLOCK +/* loop */ +IGNORE_IOCTL(LOOP_CLR_FD) /* Raw devices */ HANDLE_IOCTL(RAW_SETBIND, raw_ioctl) HANDLE_IOCTL(RAW_GETBIND, raw_ioctl) @@ -2728,9 +2730,6 @@ HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans) IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32) IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32) -/* loop */ -IGNORE_IOCTL(LOOP_CLR_FD) - #ifdef CONFIG_SPARC /* Sparc framebuffers, handled in sbusfb_compat_ioctl() */ IGNORE_IOCTL(FBIOGTYPE) diff --git a/fs/dcache.c b/fs/dcache.c index 761d30b..1fcffeb 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2149,7 +2149,6 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) int result; unsigned long seq; - /* FIXME: This is old behavior, needed? Please check callers. */ if (new_dentry == old_dentry) return 1; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 8b65f28..b91851f 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -483,15 +483,7 @@ int ecryptfs_encrypt_page(struct page *page) ecryptfs_inode = page->mapping->host; crypt_stat = &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); - if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { - rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, - 0, PAGE_CACHE_SIZE); - if (rc) - printk(KERN_ERR "%s: Error attempting to copy " - "page at index [%ld]\n", __func__, - page->index); - goto out; - } + BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); enc_extent_page = alloc_page(GFP_USER); if (!enc_extent_page) { rc = -ENOMEM; @@ -620,16 +612,7 @@ int ecryptfs_decrypt_page(struct page *page) ecryptfs_inode = page->mapping->host; crypt_stat = &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); - if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { - rc = ecryptfs_read_lower_page_segment(page, page->index, 0, - PAGE_CACHE_SIZE, - ecryptfs_inode); - if (rc) - printk(KERN_ERR "%s: Error attempting to copy " - "page at index [%ld]\n", __func__, - page->index); - goto out; - } + BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); enc_extent_page = alloc_page(GFP_USER); if (!enc_extent_page) { rc = -ENOMEM; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 064c582..00b30a2 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -269,6 +269,7 @@ struct ecryptfs_crypt_stat { #define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800 #define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000 #define ECRYPTFS_ENCFN_USE_FEK 0x00002000 +#define ECRYPTFS_UNLINK_SIGS 0x00004000 u32 flags; unsigned int file_version; size_t iv_bytes; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 55b3145..2f0945d 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -379,9 +379,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, goto out_d_drop; } lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); + mutex_lock(&lower_dir_dentry->d_inode->i_mutex); lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, lower_dir_dentry, ecryptfs_dentry->d_name.len); + mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " @@ -406,9 +408,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, "filename; rc = [%d]\n", __func__, rc); goto out_d_drop; } + mutex_lock(&lower_dir_dentry->d_inode->i_mutex); lower_dentry = lookup_one_len(encrypted_and_encoded_name, lower_dir_dentry, encrypted_and_encoded_name_size - 1); + mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " @@ -636,8 +640,9 @@ static int ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) { char *lower_buf; + size_t lower_bufsiz; struct dentry *lower_dentry; - struct ecryptfs_crypt_stat *crypt_stat; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; char *plaintext_name; size_t plaintext_name_size; mm_segment_t old_fs; @@ -648,12 +653,21 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) rc = -EINVAL; goto out; } - crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; + mount_crypt_stat = &ecryptfs_superblock_to_private( + dentry->d_sb)->mount_crypt_stat; + /* + * If the lower filename is encrypted, it will result in a significantly + * longer name. If needed, truncate the name after decode and decrypt. + */ + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + lower_bufsiz = PATH_MAX; + else + lower_bufsiz = bufsiz; /* Released in this function */ - lower_buf = kmalloc(bufsiz, GFP_KERNEL); + lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL); if (lower_buf == NULL) { printk(KERN_ERR "%s: Out of memory whilst attempting to " - "kmalloc [%d] bytes\n", __func__, bufsiz); + "kmalloc [%zd] bytes\n", __func__, lower_bufsiz); rc = -ENOMEM; goto out; } @@ -661,7 +675,7 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) set_fs(get_ds()); rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, (char __user *)lower_buf, - bufsiz); + lower_bufsiz); set_fs(old_fs); if (rc >= 0) { rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name, @@ -674,7 +688,9 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) rc); goto out_free_lower_buf; } - rc = copy_to_user(buf, plaintext_name, plaintext_name_size); + /* Check for bufsiz <= 0 done in sys_readlinkat() */ + rc = copy_to_user(buf, plaintext_name, + min((size_t) bufsiz, plaintext_name_size)); if (rc) rc = -EFAULT; else @@ -814,6 +830,13 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) size_t num_zeros = (PAGE_CACHE_SIZE - (new_length & ~PAGE_CACHE_MASK)); + if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + rc = vmtruncate(inode, new_length); + if (rc) + goto out_free; + rc = vmtruncate(lower_dentry->d_inode, new_length); + goto out_free; + } if (num_zeros) { char *zeros_virt; @@ -915,8 +938,6 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) } rc = 0; crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); - mutex_unlock(&crypt_stat->cs_mutex); - goto out; } } mutex_unlock(&crypt_stat->cs_mutex); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index aed56c2..ccabd5f 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -190,14 +190,14 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, init_special_inode(inode, lower_inode->i_mode, lower_inode->i_rdev); dentry->d_op = &ecryptfs_dops; - if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD) - d_add(dentry, inode); - else - d_instantiate(dentry, inode); fsstack_copy_attr_all(inode, lower_inode, NULL); /* This size will be overwritten for real files w/ headers and * other metadata */ fsstack_copy_inode_size(inode, lower_inode); + if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD) + d_add(dentry, inode); + else + d_instantiate(dentry, inode); out: return rc; } @@ -208,7 +208,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, - ecryptfs_opt_err }; + ecryptfs_opt_unlink_sigs, ecryptfs_opt_err }; static const match_table_t tokens = { {ecryptfs_opt_sig, "sig=%s"}, @@ -222,6 +222,7 @@ static const match_table_t tokens = { {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"}, {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, + {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, {ecryptfs_opt_err, NULL} }; @@ -402,6 +403,9 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) fn_cipher_key_bytes; fn_cipher_key_bytes_set = 1; break; + case ecryptfs_opt_unlink_sigs: + mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS; + break; case ecryptfs_opt_err: default: printk(KERN_WARNING diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 295e7fa5..f1c17e8 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -133,45 +133,6 @@ out: return rc; } -static int -ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type, - struct ecryptfs_msg_ctx **msg_ctx); - -/** - * ecryptfs_send_raw_message - * @msg_type: Message type - * @daemon: Daemon struct for recipient of message - * - * A raw message is one that does not include an ecryptfs_message - * struct. It simply has a type. - * - * Must be called with ecryptfs_daemon_hash_mux held. - * - * Returns zero on success; non-zero otherwise - */ -static int ecryptfs_send_raw_message(u8 msg_type, - struct ecryptfs_daemon *daemon) -{ - struct ecryptfs_msg_ctx *msg_ctx; - int rc; - - rc = ecryptfs_send_message_locked(NULL, 0, msg_type, &msg_ctx); - if (rc) { - printk(KERN_ERR "%s: Error whilst attempting to send " - "message to ecryptfsd; rc = [%d]\n", __func__, rc); - goto out; - } - /* Raw messages are logically context-free (e.g., no - * reply is expected), so we set the state of the - * ecryptfs_msg_ctx object to indicate that it should - * be freed as soon as the message is sent. */ - mutex_lock(&msg_ctx->mux); - msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY; - mutex_unlock(&msg_ctx->mux); -out: - return rc; -} - /** * ecryptfs_spawn_daemon - Create and initialize a new daemon struct * @daemon: Pointer to set to newly allocated daemon struct @@ -212,49 +173,6 @@ out: } /** - * ecryptfs_process_helo - * @euid: The user ID owner of the message - * @user_ns: The namespace in which @euid applies - * @pid: The process ID for the userspace program that sent the - * message - * - * Adds the euid and pid values to the daemon euid hash. If an euid - * already has a daemon pid registered, the daemon will be - * unregistered before the new daemon is put into the hash list. - * Returns zero after adding a new daemon to the hash list; - * non-zero otherwise. - */ -int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns, - struct pid *pid) -{ - struct ecryptfs_daemon *new_daemon; - struct ecryptfs_daemon *old_daemon; - int rc; - - mutex_lock(&ecryptfs_daemon_hash_mux); - rc = ecryptfs_find_daemon_by_euid(&old_daemon, euid, user_ns); - if (rc != 0) { - printk(KERN_WARNING "Received request from user [%d] " - "to register daemon [0x%p]; unregistering daemon " - "[0x%p]\n", euid, pid, old_daemon->pid); - rc = ecryptfs_send_raw_message(ECRYPTFS_MSG_QUIT, old_daemon); - if (rc) - printk(KERN_WARNING "Failed to send QUIT " - "message to daemon [0x%p]; rc = [%d]\n", - old_daemon->pid, rc); - hlist_del(&old_daemon->euid_chain); - kfree(old_daemon); - } - rc = ecryptfs_spawn_daemon(&new_daemon, euid, user_ns, pid); - if (rc) - printk(KERN_ERR "%s: The gods are displeased with this attempt " - "to create a new daemon object for euid [%d]; pid " - "[0x%p]; rc = [%d]\n", __func__, euid, pid, rc); - mutex_unlock(&ecryptfs_daemon_hash_mux); - return rc; -} - -/** * ecryptfs_exorcise_daemon - Destroy the daemon struct * * Must be called ceremoniously while in possession of diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index a67fea6..4ec8f61 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -193,26 +193,20 @@ int ecryptfs_send_miscdev(char *data, size_t data_size, int rc = 0; mutex_lock(&msg_ctx->mux); - if (data) { - msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size), - GFP_KERNEL); - if (!msg_ctx->msg) { - rc = -ENOMEM; - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kmalloc(%zd, GFP_KERNEL)\n", __func__, - (sizeof(*msg_ctx->msg) + data_size)); - goto out_unlock; - } - } else - msg_ctx->msg = NULL; + msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size), + GFP_KERNEL); + if (!msg_ctx->msg) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc(%zd, GFP_KERNEL)\n", __func__, + (sizeof(*msg_ctx->msg) + data_size)); + goto out_unlock; + } msg_ctx->msg->index = msg_ctx->index; msg_ctx->msg->data_len = data_size; msg_ctx->type = msg_type; - if (data) { - memcpy(msg_ctx->msg->data, data, data_size); - msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size); - } else - msg_ctx->msg_size = 0; + memcpy(msg_ctx->msg->data, data, data_size); + msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size); mutex_lock(&daemon->mux); list_add_tail(&msg_ctx->daemon_out_list, &daemon->msg_ctx_out_queue); daemon->num_queued_msg_ctx++; @@ -418,18 +412,13 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, if (count == 0) goto out; - data = kmalloc(count, GFP_KERNEL); - if (!data) { - printk(KERN_ERR "%s: Out of memory whilst attempting to " - "kmalloc([%zd], GFP_KERNEL)\n", __func__, count); + + data = memdup_user(buf, count); + if (IS_ERR(data)) { + printk(KERN_ERR "%s: memdup_user returned error [%ld]\n", + __func__, PTR_ERR(data)); goto out; } - rc = copy_from_user(data, buf, count); - if (rc) { - printk(KERN_ERR "%s: copy_from_user returned error [%d]\n", - __func__, rc); - goto out_free; - } sz = count; i = 0; switch (data[i++]) { diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 46cec2b..5c6bab9 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -449,6 +449,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode) struct ecryptfs_crypt_stat *crypt_stat; crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; + BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) return ecryptfs_write_inode_size_to_xattr(ecryptfs_inode); else @@ -490,6 +491,16 @@ static int ecryptfs_write_end(struct file *file, ecryptfs_printk(KERN_DEBUG, "Not a new file\n"); ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" "(page w/ index = [0x%.16x], to = [%d])\n", index, to); + if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0, + to); + if (!rc) { + rc = copied; + fsstack_copy_inode_size(ecryptfs_inode, + ecryptfs_inode_to_lower(ecryptfs_inode)); + } + goto out; + } /* Fills in zeros if 'to' goes beyond inode size */ rc = fill_zeros_to_end_of_page(page, to); if (rc) { diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 75c2ea9..a137c6e 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -117,13 +117,15 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, size_t size) { struct page *ecryptfs_page; + struct ecryptfs_crypt_stat *crypt_stat; + struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode; char *ecryptfs_page_virt; - loff_t ecryptfs_file_size = - i_size_read(ecryptfs_file->f_dentry->d_inode); + loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); loff_t data_offset = 0; loff_t pos; int rc = 0; + crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; /* * if we are writing beyond current size, then start pos * at the current size - we'll fill in zeros from there. @@ -184,7 +186,13 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, flush_dcache_page(ecryptfs_page); SetPageUptodate(ecryptfs_page); unlock_page(ecryptfs_page); - rc = ecryptfs_encrypt_page(ecryptfs_page); + if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) + rc = ecryptfs_encrypt_page(ecryptfs_page); + else + rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, + ecryptfs_page, + start_offset_in_page, + data_offset); page_cache_release(ecryptfs_page); if (rc) { printk(KERN_ERR "%s: Error encrypting " @@ -194,14 +202,16 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, pos += num_bytes; } if ((offset + size) > ecryptfs_file_size) { - i_size_write(ecryptfs_file->f_dentry->d_inode, (offset + size)); - rc = ecryptfs_write_inode_size_to_metadata( - ecryptfs_file->f_dentry->d_inode); - if (rc) { - printk(KERN_ERR "Problem with " - "ecryptfs_write_inode_size_to_metadata; " - "rc = [%d]\n", rc); - goto out; + i_size_write(ecryptfs_inode, (offset + size)); + if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) { + rc = ecryptfs_write_inode_size_to_metadata( + ecryptfs_inode); + if (rc) { + printk(KERN_ERR "Problem with " + "ecryptfs_write_inode_size_to_metadata; " + "rc = [%d]\n", rc); + goto out; + } } } out: diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index c27ac2b..fa4c7e7 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -170,7 +170,10 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt) list_for_each_entry(walker, &mount_crypt_stat->global_auth_tok_list, mount_crypt_stat_list) { - seq_printf(m, ",ecryptfs_sig=%s", walker->sig); + if (walker->flags & ECRYPTFS_AUTH_TOK_FNEK) + seq_printf(m, ",ecryptfs_fnek_sig=%s", walker->sig); + else + seq_printf(m, ",ecryptfs_sig=%s", walker->sig); } mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); @@ -186,6 +189,8 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, ",ecryptfs_xattr_metadata"); if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) seq_printf(m, ",ecryptfs_encrypted_view"); + if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS) + seq_printf(m, ",ecryptfs_unlink_sigs"); return 0; } @@ -1060,7 +1060,6 @@ EXPORT_SYMBOL(install_exec_creds); int check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; - unsigned long flags; unsigned n_fs; int res = 0; @@ -1068,21 +1067,22 @@ int check_unsafe_exec(struct linux_binprm *bprm) n_fs = 1; write_lock(&p->fs->lock); - lock_task_sighand(p, &flags); + rcu_read_lock(); for (t = next_thread(p); t != p; t = next_thread(t)) { if (t->fs == p->fs) n_fs++; } + rcu_read_unlock(); if (p->fs->users > n_fs) { bprm->unsafe |= LSM_UNSAFE_SHARE; } else { - if (p->fs->in_exec) - res = -EAGAIN; - p->fs->in_exec = 1; + res = -EAGAIN; + if (!p->fs->in_exec) { + p->fs->in_exec = 1; + res = 1; + } } - - unlock_task_sighand(p, &flags); write_unlock(&p->fs->lock); return res; @@ -1284,6 +1284,7 @@ int do_execve(char * filename, struct linux_binprm *bprm; struct file *file; struct files_struct *displaced; + bool clear_in_exec; int retval; retval = unshare_files(&displaced); @@ -1306,8 +1307,9 @@ int do_execve(char * filename, goto out_unlock; retval = check_unsafe_exec(bprm); - if (retval) + if (retval < 0) goto out_unlock; + clear_in_exec = retval; file = open_exec(filename); retval = PTR_ERR(file); @@ -1355,9 +1357,7 @@ int do_execve(char * filename, goto out; /* execve succeeded */ - write_lock(¤t->fs->lock); current->fs->in_exec = 0; - write_unlock(¤t->fs->lock); current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); acct_update_integrals(current); @@ -1377,9 +1377,8 @@ out_file: } out_unmark: - write_lock(¤t->fs->lock); - current->fs->in_exec = 0; - write_unlock(¤t->fs->lock); + if (clear_in_exec) + current->fs->in_exec = 0; out_unlock: current->in_execve = 0; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f983225..5c4afe6 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1395,8 +1395,10 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) + if (len == towrite) { + mutex_unlock(&inode->i_mutex); return err; + } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode->i_version++; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2a1cb09..e403321 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -326,11 +326,14 @@ ext4_ext_max_entries(struct inode *inode, int depth) static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { - ext4_fsblk_t block = ext_pblock(ext); + ext4_fsblk_t block = ext_pblock(ext), valid_block; int len = ext4_ext_get_actual_len(ext); struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - if (unlikely(block < le32_to_cpu(es->s_first_data_block) || - ((block + len) > ext4_blocks_count(es)))) + + valid_block = le32_to_cpu(es->s_first_data_block) + + EXT4_SB(inode->i_sb)->s_gdb_count; + if (unlikely(block <= valid_block || + ((block + len) > ext4_blocks_count(es)))) return 0; else return 1; @@ -339,10 +342,13 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) static int ext4_valid_extent_idx(struct inode *inode, struct ext4_extent_idx *ext_idx) { - ext4_fsblk_t block = idx_pblock(ext_idx); + ext4_fsblk_t block = idx_pblock(ext_idx), valid_block; struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - if (unlikely(block < le32_to_cpu(es->s_first_data_block) || - (block >= ext4_blocks_count(es)))) + + valid_block = le32_to_cpu(es->s_first_data_block) + + EXT4_SB(inode->i_sb)->s_gdb_count; + if (unlikely(block <= valid_block || + (block >= ext4_blocks_count(es)))) return 0; else return 1; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 47b84e8..f18e0a0 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -585,6 +585,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, fallback: ngroups = sbi->s_groups_count; avefreei = freei / ngroups; +fallback_retry: parent_group = EXT4_I(parent)->i_block_group; for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; @@ -602,7 +603,7 @@ fallback: * filesystems the above test can fail to find any blockgroups */ avefreei = 0; - goto fallback; + goto fallback_retry; } return -1; @@ -831,11 +832,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) ret2 = find_group_flex(sb, dir, &group); if (ret2 == -1) { ret2 = find_group_other(sb, dir, &group, mode); - if (ret2 == 0 && once) + if (ret2 == 0 && once) { once = 0; printk(KERN_NOTICE "ext4: find_group_flex " "failed, fallback succeeded dir %lu\n", dir->i_ino); + } } goto got_group; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c6bd6ce..e91f978 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4357,11 +4357,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_flags = le32_to_cpu(raw_inode->i_flags); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); - if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != - cpu_to_le32(EXT4_OS_HURD)) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; - } inode->i_size = ext4_isize(raw_inode); ei->i_disksize = inode->i_size; inode->i_generation = le32_to_cpu(raw_inode->i_generation); @@ -4409,9 +4407,23 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } - if (ei->i_flags & EXT4_EXTENTS_FL) { - /* Validate extent which is part of inode */ - ret = ext4_ext_check_inode(inode); + ret = 0; + if (ei->i_file_acl && + ((ei->i_file_acl < + (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + + EXT4_SB(sb)->s_gdb_count)) || + (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { + ext4_error(sb, __func__, + "bad extended attribute block %llu in inode #%lu", + ei->i_file_acl, inode->i_ino); + ret = -EIO; + goto bad_inode; + } else if (ei->i_flags & EXT4_EXTENTS_FL) { + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) + /* Validate extent which is part of inode */ + ret = ext4_ext_check_inode(inode); } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode))) { diff --git a/fs/filesystems.c b/fs/filesystems.c index 1aa7026..a24c58e 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -199,7 +199,7 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) return retval; } -int get_filesystem_list(char * buf) +int __init get_filesystem_list(char *buf) { int len = 0; struct file_system_type * tmp; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index bf23a62..70f87f4 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -156,6 +156,12 @@ static void inode_go_sync(struct gfs2_glock *gl) error = filemap_fdatawait(metamapping); mapping_set_error(metamapping, error); gfs2_ail_empty_gl(gl); + /* + * Writeback of the data mapping may cause the dirty flag to be set + * so we have to clear it again here. + */ + smp_mb__before_clear_bit(); + clear_bit(GLF_DIRTY, &gl->gl_flags); } /** diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 101caf3..5d82e91 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -413,7 +413,9 @@ out_unlock: gfs2_glock_dq(&gh); out: gfs2_holder_uninit(&gh); - if (ret) + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else if (ret) ret = VM_FAULT_SIGBUS; return ret; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index f03d024..5650382 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -212,8 +212,7 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len, if (tmp == 0) return BFITNOENT; ptr--; - bit = fls64(tmp); - bit--; /* fls64 always adds one to the bit count */ + bit = __ffs64(tmp); bit /= 2; /* two bits per entry in the bitmap */ return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit; } @@ -1445,10 +1444,12 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; struct gfs2_alloc *al = ip->i_alloc; struct gfs2_rgrpd *rgd = al->al_rgd; u32 goal, blk; u64 block; + int error; if (rgrp_contains_block(rgd, ip->i_goal)) goal = ip->i_goal - rgd->rd_data0; @@ -1461,7 +1462,13 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) rgd->rd_last_alloc = blk; block = rgd->rd_data0 + blk; ip->i_goal = block; - + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error == 0) { + struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal); + brelse(dibh); + } gfs2_assert_withdraw(sdp, rgd->rd_free >= *n); rgd->rd_free -= *n; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 23a3c76..153d968 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -26,7 +26,6 @@ #include <linux/pagevec.h> #include <linux/parser.h> #include <linux/mman.h> -#include <linux/quotaops.h> #include <linux/slab.h> #include <linux/dnotify.h> #include <linux/statfs.h> @@ -842,7 +841,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) bad_val: printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n", args[0].from, p); - return 1; + return -EINVAL; } static int diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index a8e8513..06560c5 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -502,7 +502,7 @@ void journal_commit_transaction(journal_t *journal) err = 0; } - journal_write_revoke_records(journal, commit_transaction); + journal_write_revoke_records(journal, commit_transaction, write_op); /* * If we found any dirty or locked buffers, then we should have diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index 3e9afc2..da6cd9b 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -86,6 +86,7 @@ #include <linux/slab.h> #include <linux/list.h> #include <linux/init.h> +#include <linux/bio.h> #endif #include <linux/log2.h> @@ -118,8 +119,8 @@ struct jbd_revoke_table_s #ifdef __KERNEL__ static void write_one_revoke_record(journal_t *, transaction_t *, struct journal_head **, int *, - struct jbd_revoke_record_s *); -static void flush_descriptor(journal_t *, struct journal_head *, int); + struct jbd_revoke_record_s *, int); +static void flush_descriptor(journal_t *, struct journal_head *, int, int); #endif /* Utility functions to maintain the revoke table */ @@ -500,7 +501,7 @@ void journal_switch_revoke_table(journal_t *journal) * revoke hash, deleting the entries as we go. */ void journal_write_revoke_records(journal_t *journal, - transaction_t *transaction) + transaction_t *transaction, int write_op) { struct journal_head *descriptor; struct jbd_revoke_record_s *record; @@ -524,14 +525,14 @@ void journal_write_revoke_records(journal_t *journal, hash_list->next; write_one_revoke_record(journal, transaction, &descriptor, &offset, - record); + record, write_op); count++; list_del(&record->hash); kmem_cache_free(revoke_record_cache, record); } } if (descriptor) - flush_descriptor(journal, descriptor, offset); + flush_descriptor(journal, descriptor, offset, write_op); jbd_debug(1, "Wrote %d revoke records\n", count); } @@ -544,7 +545,8 @@ static void write_one_revoke_record(journal_t *journal, transaction_t *transaction, struct journal_head **descriptorp, int *offsetp, - struct jbd_revoke_record_s *record) + struct jbd_revoke_record_s *record, + int write_op) { struct journal_head *descriptor; int offset; @@ -563,7 +565,7 @@ static void write_one_revoke_record(journal_t *journal, /* Make sure we have a descriptor with space left for the record */ if (descriptor) { if (offset == journal->j_blocksize) { - flush_descriptor(journal, descriptor, offset); + flush_descriptor(journal, descriptor, offset, write_op); descriptor = NULL; } } @@ -600,7 +602,7 @@ static void write_one_revoke_record(journal_t *journal, static void flush_descriptor(journal_t *journal, struct journal_head *descriptor, - int offset) + int offset, int write_op) { journal_revoke_header_t *header; struct buffer_head *bh = jh2bh(descriptor); @@ -615,7 +617,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(bh); BUFFER_TRACE(bh, "write"); set_buffer_dirty(bh); - ll_rw_block(SWRITE, 1, &bh); + ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh); } #endif diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 073c8c3..0b7d3b8 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -506,7 +506,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (err) jbd2_journal_abort(journal, err); - jbd2_journal_write_revoke_records(journal, commit_transaction); + jbd2_journal_write_revoke_records(journal, commit_transaction, + write_op); jbd_debug(3, "JBD: commit phase 2\n"); diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index bbe6d59..a360b06 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -86,6 +86,7 @@ #include <linux/slab.h> #include <linux/list.h> #include <linux/init.h> +#include <linux/bio.h> #endif #include <linux/log2.h> @@ -118,8 +119,8 @@ struct jbd2_revoke_table_s #ifdef __KERNEL__ static void write_one_revoke_record(journal_t *, transaction_t *, struct journal_head **, int *, - struct jbd2_revoke_record_s *); -static void flush_descriptor(journal_t *, struct journal_head *, int); + struct jbd2_revoke_record_s *, int); +static void flush_descriptor(journal_t *, struct journal_head *, int, int); #endif /* Utility functions to maintain the revoke table */ @@ -499,7 +500,8 @@ void jbd2_journal_switch_revoke_table(journal_t *journal) * revoke hash, deleting the entries as we go. */ void jbd2_journal_write_revoke_records(journal_t *journal, - transaction_t *transaction) + transaction_t *transaction, + int write_op) { struct journal_head *descriptor; struct jbd2_revoke_record_s *record; @@ -523,14 +525,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal, hash_list->next; write_one_revoke_record(journal, transaction, &descriptor, &offset, - record); + record, write_op); count++; list_del(&record->hash); kmem_cache_free(jbd2_revoke_record_cache, record); } } if (descriptor) - flush_descriptor(journal, descriptor, offset); + flush_descriptor(journal, descriptor, offset, write_op); jbd_debug(1, "Wrote %d revoke records\n", count); } @@ -543,7 +545,8 @@ static void write_one_revoke_record(journal_t *journal, transaction_t *transaction, struct journal_head **descriptorp, int *offsetp, - struct jbd2_revoke_record_s *record) + struct jbd2_revoke_record_s *record, + int write_op) { struct journal_head *descriptor; int offset; @@ -562,7 +565,7 @@ static void write_one_revoke_record(journal_t *journal, /* Make sure we have a descriptor with space left for the record */ if (descriptor) { if (offset == journal->j_blocksize) { - flush_descriptor(journal, descriptor, offset); + flush_descriptor(journal, descriptor, offset, write_op); descriptor = NULL; } } @@ -607,7 +610,7 @@ static void write_one_revoke_record(journal_t *journal, static void flush_descriptor(journal_t *journal, struct journal_head *descriptor, - int offset) + int offset, int write_op) { jbd2_journal_revoke_header_t *header; struct buffer_head *bh = jh2bh(descriptor); @@ -622,7 +625,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(bh); BUFFER_TRACE(bh, "write"); set_buffer_dirty(bh); - ll_rw_block(SWRITE, 1, &bh); + ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh); } #endif @@ -1248,6 +1248,8 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) int err; struct qstr this; + WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); + err = __lookup_one_len(name, &this, base, len); if (err) return ERR_PTR(err); diff --git a/fs/namespace.c b/fs/namespace.c index d9138f8..4119620 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1377,7 +1377,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, if (parent_path) { detach_mnt(source_mnt, parent_path); attach_mnt(source_mnt, path); - touch_mnt_namespace(current->nsproxy->mnt_ns); + touch_mnt_namespace(parent_path->mnt->mnt_ns); } else { mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); commit_tree(source_mnt); diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index f54360f..fa038df 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -660,13 +660,10 @@ outrel: if (user.object_name_len > NCP_OBJECT_NAME_MAX_LEN) return -ENOMEM; if (user.object_name_len) { - newname = kmalloc(user.object_name_len, GFP_USER); - if (!newname) - return -ENOMEM; - if (copy_from_user(newname, user.object_name, user.object_name_len)) { - kfree(newname); - return -EFAULT; - } + newname = memdup_user(user.object_name, + user.object_name_len); + if (IS_ERR(newname)) + return PTR_ERR(newname); } else { newname = NULL; } @@ -760,13 +757,9 @@ outrel: if (user.len > NCP_PRIVATE_DATA_MAX_LEN) return -ENOMEM; if (user.len) { - new = kmalloc(user.len, GFP_USER); - if (!new) - return -ENOMEM; - if (copy_from_user(new, user.data, user.len)) { - kfree(new); - return -EFAULT; - } + new = memdup_user(user.data, user.len); + if (IS_ERR(new)) + return PTR_ERR(new); } else { new = NULL; } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index e6a1932..35869a4 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -713,7 +713,8 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p, if (args->npages != 0) xdr_encode_pages(buf, args->pages, 0, args->len); else - req->rq_slen += args->len; + req->rq_slen = xdr_adjust_iovec(req->rq_svec, + p + XDR_QUADLEN(args->len)); err = nfsacl_encode(buf, base, args->inode, (args->mask & NFS_ACL) ? diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 3444c00..5275097 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -229,21 +229,23 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f) goto out; status = vfs_readdir(filp, nfsd4_build_namelist, &names); fput(filp); + mutex_lock(&dir->d_inode->i_mutex); while (!list_empty(&names)) { entry = list_entry(names.next, struct name_list, list); dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); - goto out; + break; } status = f(dir, dentry); dput(dentry); if (status) - goto out; + break; list_del(&entry->list); kfree(entry); } + mutex_unlock(&dir->d_inode->i_mutex); out: while (!list_empty(&names)) { entry = list_entry(names.next, struct name_list, list); @@ -255,36 +257,6 @@ out: } static int -nfsd4_remove_clid_file(struct dentry *dir, struct dentry *dentry) -{ - int status; - - if (!S_ISREG(dir->d_inode->i_mode)) { - printk("nfsd4: non-file found in client recovery directory\n"); - return -EINVAL; - } - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); - status = vfs_unlink(dir->d_inode, dentry); - mutex_unlock(&dir->d_inode->i_mutex); - return status; -} - -static int -nfsd4_clear_clid_dir(struct dentry *dir, struct dentry *dentry) -{ - int status; - - /* For now this directory should already be empty, but we empty it of - * any regular files anyway, just in case the directory was created by - * a kernel from the future.... */ - nfsd4_list_rec_dir(dentry, nfsd4_remove_clid_file); - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); - status = vfs_rmdir(dir->d_inode, dentry); - mutex_unlock(&dir->d_inode->i_mutex); - return status; -} - -static int nfsd4_unlink_clid_dir(char *name, int namlen) { struct dentry *dentry; @@ -294,18 +266,18 @@ nfsd4_unlink_clid_dir(char *name, int namlen) mutex_lock(&rec_dir.dentry->d_inode->i_mutex); dentry = lookup_one_len(name, rec_dir.dentry, namlen); - mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); - return status; + goto out_unlock; } status = -ENOENT; if (!dentry->d_inode) goto out; - - status = nfsd4_clear_clid_dir(rec_dir.dentry, dentry); + status = vfs_rmdir(rec_dir.dentry->d_inode, dentry); out: dput(dentry); +out_unlock: + mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); return status; } @@ -348,7 +320,7 @@ purge_old(struct dentry *parent, struct dentry *child) if (nfs4_has_reclaimed_state(child->d_name.name, false)) return 0; - status = nfsd4_clear_clid_dir(parent, child); + status = vfs_rmdir(parent->d_inode, child); if (status) printk("failed to remove client recovery directory %s\n", child->d_name.name); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ab93fcf..6c68ffd 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -116,10 +116,15 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, } if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { /* successfully crossed mount point */ - exp_put(exp); - *expp = exp2; + /* + * This is subtle: dentry is *not* under mnt at this point. + * The only reason we are safe is that original mnt is pinned + * down by exp, so we should dput before putting exp. + */ dput(dentry); *dpp = mounts; + exp_put(exp); + *expp = exp2; } else { exp_put(exp2); dput(mounts); @@ -1885,8 +1890,8 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen, return 0; } -static int nfsd_buffered_readdir(struct file *file, filldir_t func, - struct readdir_cd *cdp, loff_t *offsetp) +static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func, + struct readdir_cd *cdp, loff_t *offsetp) { struct readdir_data buf; struct buffered_dirent *de; @@ -1896,11 +1901,12 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func, buf.dirent = (void *)__get_free_page(GFP_KERNEL); if (!buf.dirent) - return -ENOMEM; + return nfserrno(-ENOMEM); offset = *offsetp; while (1) { + struct inode *dir_inode = file->f_path.dentry->d_inode; unsigned int reclen; cdp->err = nfserr_eof; /* will be cleared on successful read */ @@ -1919,26 +1925,38 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func, if (!size) break; + /* + * Various filldir functions may end up calling back into + * lookup_one_len() and the file system's ->lookup() method. + * These expect i_mutex to be held, as it would within readdir. + */ + host_err = mutex_lock_killable(&dir_inode->i_mutex); + if (host_err) + break; + de = (struct buffered_dirent *)buf.dirent; while (size > 0) { offset = de->offset; if (func(cdp, de->name, de->namlen, de->offset, de->ino, de->d_type)) - goto done; + break; if (cdp->err != nfs_ok) - goto done; + break; reclen = ALIGN(sizeof(*de) + de->namlen, sizeof(u64)); size -= reclen; de = (struct buffered_dirent *)((char *)de + reclen); } + mutex_unlock(&dir_inode->i_mutex); + if (size > 0) /* We bailed out early */ + break; + offset = vfs_llseek(file, 0, SEEK_CUR); } - done: free_page((unsigned long)(buf.dirent)); if (host_err) diff --git a/fs/proc/stat.c b/fs/proc/stat.c index f75efa2..81e4eb6 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -18,6 +18,9 @@ #ifndef arch_irq_stat #define arch_irq_stat() 0 #endif +#ifndef arch_idle_time +#define arch_idle_time(cpu) 0 +#endif static int show_stat(struct seq_file *p, void *v) { @@ -40,6 +43,7 @@ static int show_stat(struct seq_file *p, void *v) nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); system = cputime64_add(system, kstat_cpu(i).cpustat.system); idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle); + idle = cputime64_add(idle, arch_idle_time(i)); iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait); irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); @@ -69,6 +73,7 @@ static int show_stat(struct seq_file *p, void *v) nice = kstat_cpu(i).cpustat.nice; system = kstat_cpu(i).cpustat.system; idle = kstat_cpu(i).cpustat.idle; + idle = cputime64_add(idle, arch_idle_time(i)); iowait = kstat_cpu(i).cpustat.iowait; irq = kstat_cpu(i).cpustat.irq; softirq = kstat_cpu(i).cpustat.softirq; diff --git a/fs/quota/Makefile b/fs/quota/Makefile index 385a083..68d4f6d 100644 --- a/fs/quota/Makefile +++ b/fs/quota/Makefile @@ -1,12 +1,3 @@ -# -# Makefile for the Linux filesystems. -# -# 14 Sep 2000, Christoph Hellwig <hch@infradead.org> -# Rewritten to use lists instead of if-statements. -# - -obj-y := - obj-$(CONFIG_QUOTA) += dquot.o obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h index 06044a9..95217b8 100644 --- a/fs/romfs/internal.h +++ b/fs/romfs/internal.h @@ -43,5 +43,5 @@ extern int romfs_dev_read(struct super_block *sb, unsigned long pos, void *buf, size_t buflen); extern ssize_t romfs_dev_strnlen(struct super_block *sb, unsigned long pos, size_t maxlen); -extern int romfs_dev_strncmp(struct super_block *sb, unsigned long pos, - const char *str, size_t size); +extern int romfs_dev_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size); diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c index 7e3e1e1..b3208ad 100644 --- a/fs/romfs/storage.c +++ b/fs/romfs/storage.c @@ -67,26 +67,35 @@ static ssize_t romfs_mtd_strnlen(struct super_block *sb, * compare a string to one in a romfs image on MTD * - return 1 if matched, 0 if differ, -ve if error */ -static int romfs_mtd_strncmp(struct super_block *sb, unsigned long pos, - const char *str, size_t size) +static int romfs_mtd_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) { - u_char buf[16]; + u_char buf[17]; size_t len, segment; int ret; - /* scan the string up to 16 bytes at a time */ + /* scan the string up to 16 bytes at a time, and attempt to grab the + * trailing NUL whilst we're at it */ + buf[0] = 0xff; + while (size > 0) { - segment = min_t(size_t, size, 16); + segment = min_t(size_t, size + 1, 17); ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf); if (ret < 0) return ret; + len--; if (memcmp(buf, str, len) != 0) return 0; + buf[0] = buf[len]; size -= len; pos += len; str += len; } + /* check the trailing NUL was */ + if (buf[0]) + return 0; + return 1; } #endif /* CONFIG_ROMFS_ON_MTD */ @@ -111,6 +120,7 @@ static int romfs_blk_read(struct super_block *sb, unsigned long pos, return -EIO; memcpy(buf, bh->b_data + offset, segment); brelse(bh); + buf += segment; buflen -= segment; pos += segment; } @@ -154,28 +164,48 @@ static ssize_t romfs_blk_strnlen(struct super_block *sb, * compare a string to one in a romfs image on a block device * - return 1 if matched, 0 if differ, -ve if error */ -static int romfs_blk_strncmp(struct super_block *sb, unsigned long pos, - const char *str, size_t size) +static int romfs_blk_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) { struct buffer_head *bh; unsigned long offset; size_t segment; - bool x; + bool matched, terminated = false; - /* scan the string up to 16 bytes at a time */ + /* compare string up to a block at a time */ while (size > 0) { offset = pos & (ROMBSIZE - 1); segment = min_t(size_t, size, ROMBSIZE - offset); bh = sb_bread(sb, pos >> ROMBSBITS); if (!bh) return -EIO; - x = (memcmp(bh->b_data + offset, str, segment) != 0); - brelse(bh); - if (x) - return 0; + matched = (memcmp(bh->b_data + offset, str, segment) == 0); + size -= segment; pos += segment; str += segment; + if (matched && size == 0 && offset + segment < ROMBSIZE) { + if (!bh->b_data[offset + segment]) + terminated = true; + else + matched = false; + } + brelse(bh); + if (!matched) + return 0; + } + + if (!terminated) { + /* the terminating NUL must be on the first byte of the next + * block */ + BUG_ON((pos & (ROMBSIZE - 1)) != 0); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + matched = !bh->b_data[0]; + brelse(bh); + if (!matched) + return 0; } return 1; @@ -234,10 +264,12 @@ ssize_t romfs_dev_strnlen(struct super_block *sb, /* * compare a string to one in romfs + * - the string to be compared to, str, may not be NUL-terminated; instead the + * string is of the specified size * - return 1 if matched, 0 if differ, -ve if error */ -int romfs_dev_strncmp(struct super_block *sb, unsigned long pos, - const char *str, size_t size) +int romfs_dev_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) { size_t limit; @@ -246,16 +278,16 @@ int romfs_dev_strncmp(struct super_block *sb, unsigned long pos, return -EIO; if (size > ROMFS_MAXFN) return -ENAMETOOLONG; - if (size > limit - pos) + if (size + 1 > limit - pos) return -EIO; #ifdef CONFIG_ROMFS_ON_MTD if (sb->s_mtd) - return romfs_mtd_strncmp(sb, pos, str, size); + return romfs_mtd_strcmp(sb, pos, str, size); #endif #ifdef CONFIG_ROMFS_ON_BLOCK if (sb->s_bdev) - return romfs_blk_strncmp(sb, pos, str, size); + return romfs_blk_strcmp(sb, pos, str, size); #endif return -EIO; } diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 10ca7d9..c53b5ef 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -240,8 +240,8 @@ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry, goto error; /* try to match the first 16 bytes of name */ - ret = romfs_dev_strncmp(dir->i_sb, offset + ROMFH_SIZE, name, - len); + ret = romfs_dev_strcmp(dir->i_sb, offset + ROMFH_SIZE, name, + len); if (ret < 0) goto error; if (ret == 1) @@ -55,59 +55,54 @@ int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) EXPORT_SYMBOL(vfs_getattr); -int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat) +int vfs_fstat(unsigned int fd, struct kstat *stat) { - struct path path; - int error; + struct file *f = fget(fd); + int error = -EBADF; - error = user_path_at(dfd, name, LOOKUP_FOLLOW, &path); - if (!error) { - error = vfs_getattr(path.mnt, path.dentry, stat); - path_put(&path); + if (f) { + error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat); + fput(f); } return error; } +EXPORT_SYMBOL(vfs_fstat); -int vfs_stat(char __user *name, struct kstat *stat) +int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag) { - return vfs_stat_fd(AT_FDCWD, name, stat); -} + struct path path; + int error = -EINVAL; + int lookup_flags = 0; -EXPORT_SYMBOL(vfs_stat); + if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) + goto out; -int vfs_lstat_fd(int dfd, char __user *name, struct kstat *stat) -{ - struct path path; - int error; + if (!(flag & AT_SYMLINK_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; - error = user_path_at(dfd, name, 0, &path); - if (!error) { - error = vfs_getattr(path.mnt, path.dentry, stat); - path_put(&path); - } + error = user_path_at(dfd, filename, lookup_flags, &path); + if (error) + goto out; + + error = vfs_getattr(path.mnt, path.dentry, stat); + path_put(&path); +out: return error; } +EXPORT_SYMBOL(vfs_fstatat); -int vfs_lstat(char __user *name, struct kstat *stat) +int vfs_stat(char __user *name, struct kstat *stat) { - return vfs_lstat_fd(AT_FDCWD, name, stat); + return vfs_fstatat(AT_FDCWD, name, stat, 0); } +EXPORT_SYMBOL(vfs_stat); -EXPORT_SYMBOL(vfs_lstat); - -int vfs_fstat(unsigned int fd, struct kstat *stat) +int vfs_lstat(char __user *name, struct kstat *stat) { - struct file *f = fget(fd); - int error = -EBADF; - - if (f) { - error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat); - fput(f); - } - return error; + return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW); } +EXPORT_SYMBOL(vfs_lstat); -EXPORT_SYMBOL(vfs_fstat); #ifdef __ARCH_WANT_OLD_STAT @@ -155,23 +150,25 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf) { struct kstat stat; - int error = vfs_stat_fd(AT_FDCWD, filename, &stat); + int error; - if (!error) - error = cp_old_stat(&stat, statbuf); + error = vfs_stat(filename, &stat); + if (error) + return error; - return error; + return cp_old_stat(&stat, statbuf); } SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf) { struct kstat stat; - int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); + int error; - if (!error) - error = cp_old_stat(&stat, statbuf); + error = vfs_lstat(filename, &stat); + if (error) + return error; - return error; + return cp_old_stat(&stat, statbuf); } SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf) @@ -240,23 +237,23 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf) { struct kstat stat; - int error = vfs_stat_fd(AT_FDCWD, filename, &stat); - - if (!error) - error = cp_new_stat(&stat, statbuf); + int error = vfs_stat(filename, &stat); - return error; + if (error) + return error; + return cp_new_stat(&stat, statbuf); } SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf) { struct kstat stat; - int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); + int error; - if (!error) - error = cp_new_stat(&stat, statbuf); + error = vfs_lstat(filename, &stat); + if (error) + return error; - return error; + return cp_new_stat(&stat, statbuf); } #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) @@ -264,21 +261,12 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename, struct stat __user *, statbuf, int, flag) { struct kstat stat; - int error = -EINVAL; - - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) - goto out; - - if (flag & AT_SYMLINK_NOFOLLOW) - error = vfs_lstat_fd(dfd, filename, &stat); - else - error = vfs_stat_fd(dfd, filename, &stat); - - if (!error) - error = cp_new_stat(&stat, statbuf); + int error; -out: - return error; + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_new_stat(&stat, statbuf); } #endif @@ -404,21 +392,12 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename, struct stat64 __user *, statbuf, int, flag) { struct kstat stat; - int error = -EINVAL; - - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) - goto out; - - if (flag & AT_SYMLINK_NOFOLLOW) - error = vfs_lstat_fd(dfd, filename, &stat); - else - error = vfs_stat_fd(dfd, filename, &stat); - - if (!error) - error = cp_new_stat64(&stat, statbuf); + int error; -out: - return error; + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_new_stat64(&stat, statbuf); } #endif /* __ARCH_WANT_STAT64 */ diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 93e0c02..9345806 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -157,14 +157,9 @@ static ssize_t write(struct file *file, const char __user *userbuf, count = size - offs; } - temp = kmalloc(count, GFP_KERNEL); - if (!temp) - return -ENOMEM; - - if (copy_from_user(temp, userbuf, count)) { - count = -EFAULT; - goto out_free; - } + temp = memdup_user(userbuf, count); + if (IS_ERR(temp)) + return PTR_ERR(temp); mutex_lock(&bb->mutex); @@ -176,8 +171,6 @@ static ssize_t write(struct file *file, const char __user *userbuf, if (count > 0) *off = offs + count; -out_free: - kfree(temp); return count; } @@ -237,13 +237,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, if (size) { if (size > XATTR_SIZE_MAX) return -E2BIG; - kvalue = kmalloc(size, GFP_KERNEL); - if (!kvalue) - return -ENOMEM; - if (copy_from_user(kvalue, value, size)) { - kfree(kvalue); - return -EFAULT; - } + kvalue = memdup_user(value, size); + if (IS_ERR(kvalue)) + return PTR_ERR(kvalue); } error = vfs_setxattr(d, kname, kvalue, size, flags); diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index d0b4994..34eaab6 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -489,17 +489,12 @@ xfs_attrmulti_attr_set( if (len > XATTR_SIZE_MAX) return EINVAL; - kbuf = kmalloc(len, GFP_KERNEL); - if (!kbuf) - return ENOMEM; - - if (copy_from_user(kbuf, ubuf, len)) - goto out_kfree; + kbuf = memdup_user(ubuf, len); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); - out_kfree: - kfree(kbuf); return error; } @@ -540,20 +535,16 @@ xfs_attrmulti_by_handle( if (!size || size > 16 * PAGE_SIZE) goto out_dput; - error = ENOMEM; - ops = kmalloc(size, GFP_KERNEL); - if (!ops) + ops = memdup_user(am_hreq.ops, size); + if (IS_ERR(ops)) { + error = PTR_ERR(ops); goto out_dput; - - error = EFAULT; - if (copy_from_user(ops, am_hreq.ops, size)) - goto out_kfree_ops; + } attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); if (!attr_name) goto out_kfree_ops; - error = 0; for (i = 0; i < am_hreq.opcount; i++) { ops[i].am_error = strncpy_from_user(attr_name, diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index c70c4e3..0882d16 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -427,20 +427,16 @@ xfs_compat_attrmulti_by_handle( if (!size || size > 16 * PAGE_SIZE) goto out_dput; - error = ENOMEM; - ops = kmalloc(size, GFP_KERNEL); - if (!ops) + ops = memdup_user(compat_ptr(am_hreq.ops), size); + if (IS_ERR(ops)) { + error = PTR_ERR(ops); goto out_dput; - - error = EFAULT; - if (copy_from_user(ops, compat_ptr(am_hreq.ops), size)) - goto out_kfree_ops; + } attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); if (!attr_name) goto out_kfree_ops; - error = 0; for (i = 0; i < am_hreq.opcount; i++) { ops[i].am_error = strncpy_from_user(attr_name, |