From 6b82f3cb2d480b7714eb0ff61aee99c22160389e Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Tue, 14 Apr 2009 07:37:40 -0400 Subject: ext4: really print the find_group_flex fallback warning only once Missing braces caused the warning to print more than once. Signed-Off-By: Chuck Ebbert Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 47b84e8..cbce5aa 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -831,11 +831,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) ret2 = find_group_flex(sb, dir, &group); if (ret2 == -1) { ret2 = find_group_other(sb, dir, &group, mode); - if (ret2 == 0 && once) + if (ret2 == 0 && once) { once = 0; printk(KERN_NOTICE "ext4: find_group_flex " "failed, fallback succeeded dir %lu\n", dir->i_ino); + } } goto got_group; } -- cgit v1.1 From 67c457a8c378a006a34d92f9bd3078a80a92f250 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 14 Apr 2009 07:50:56 -0400 Subject: jbd2: use SWRITE_SYNC_PLUG when writing synchronous revoke records The revoke records must be written using the same way as the rest of the blocks during the commit process; that is, either marked as synchronous writes or as asynchornous writes. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 3 ++- fs/jbd2/revoke.c | 21 ++++++++++++--------- 2 files changed, 14 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 073c8c3..0b7d3b8 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -506,7 +506,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (err) jbd2_journal_abort(journal, err); - jbd2_journal_write_revoke_records(journal, commit_transaction); + jbd2_journal_write_revoke_records(journal, commit_transaction, + write_op); jbd_debug(3, "JBD: commit phase 2\n"); diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index bbe6d59..a360b06 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -86,6 +86,7 @@ #include #include #include +#include #endif #include @@ -118,8 +119,8 @@ struct jbd2_revoke_table_s #ifdef __KERNEL__ static void write_one_revoke_record(journal_t *, transaction_t *, struct journal_head **, int *, - struct jbd2_revoke_record_s *); -static void flush_descriptor(journal_t *, struct journal_head *, int); + struct jbd2_revoke_record_s *, int); +static void flush_descriptor(journal_t *, struct journal_head *, int, int); #endif /* Utility functions to maintain the revoke table */ @@ -499,7 +500,8 @@ void jbd2_journal_switch_revoke_table(journal_t *journal) * revoke hash, deleting the entries as we go. */ void jbd2_journal_write_revoke_records(journal_t *journal, - transaction_t *transaction) + transaction_t *transaction, + int write_op) { struct journal_head *descriptor; struct jbd2_revoke_record_s *record; @@ -523,14 +525,14 @@ void jbd2_journal_write_revoke_records(journal_t *journal, hash_list->next; write_one_revoke_record(journal, transaction, &descriptor, &offset, - record); + record, write_op); count++; list_del(&record->hash); kmem_cache_free(jbd2_revoke_record_cache, record); } } if (descriptor) - flush_descriptor(journal, descriptor, offset); + flush_descriptor(journal, descriptor, offset, write_op); jbd_debug(1, "Wrote %d revoke records\n", count); } @@ -543,7 +545,8 @@ static void write_one_revoke_record(journal_t *journal, transaction_t *transaction, struct journal_head **descriptorp, int *offsetp, - struct jbd2_revoke_record_s *record) + struct jbd2_revoke_record_s *record, + int write_op) { struct journal_head *descriptor; int offset; @@ -562,7 +565,7 @@ static void write_one_revoke_record(journal_t *journal, /* Make sure we have a descriptor with space left for the record */ if (descriptor) { if (offset == journal->j_blocksize) { - flush_descriptor(journal, descriptor, offset); + flush_descriptor(journal, descriptor, offset, write_op); descriptor = NULL; } } @@ -607,7 +610,7 @@ static void write_one_revoke_record(journal_t *journal, static void flush_descriptor(journal_t *journal, struct journal_head *descriptor, - int offset) + int offset, int write_op) { jbd2_journal_revoke_header_t *header; struct buffer_head *bh = jh2bh(descriptor); @@ -622,7 +625,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(bh); BUFFER_TRACE(bh, "write"); set_buffer_dirty(bh); - ll_rw_block(SWRITE, 1, &bh); + ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh); } #endif -- cgit v1.1 From 38d726d153cfe5efe5fe22d28d36ab382dda3a5c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 14 Apr 2009 10:10:47 -0400 Subject: jbd: use SWRITE_SYNC_PLUG when writing synchronous revoke records The revoke records must be written using the same way as the rest of the blocks during the commit process; that is, either marked as synchronous writes or as asynchornous writes. Signed-off-by: "Theodore Ts'o" --- fs/jbd/commit.c | 2 +- fs/jbd/revoke.c | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index a8e8513..06560c5 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -502,7 +502,7 @@ void journal_commit_transaction(journal_t *journal) err = 0; } - journal_write_revoke_records(journal, commit_transaction); + journal_write_revoke_records(journal, commit_transaction, write_op); /* * If we found any dirty or locked buffers, then we should have diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index c7bd649..1b1a06e 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -67,6 +67,7 @@ #include #include #include +#include #endif #include @@ -99,8 +100,8 @@ struct jbd_revoke_table_s #ifdef __KERNEL__ static void write_one_revoke_record(journal_t *, transaction_t *, struct journal_head **, int *, - struct jbd_revoke_record_s *); -static void flush_descriptor(journal_t *, struct journal_head *, int); + struct jbd_revoke_record_s *, int); +static void flush_descriptor(journal_t *, struct journal_head *, int, int); #endif /* Utility functions to maintain the revoke table */ @@ -486,7 +487,7 @@ void journal_switch_revoke_table(journal_t *journal) */ void journal_write_revoke_records(journal_t *journal, - transaction_t *transaction) + transaction_t *transaction, int write_op) { struct journal_head *descriptor; struct jbd_revoke_record_s *record; @@ -510,14 +511,14 @@ void journal_write_revoke_records(journal_t *journal, hash_list->next; write_one_revoke_record(journal, transaction, &descriptor, &offset, - record); + record, write_op); count++; list_del(&record->hash); kmem_cache_free(revoke_record_cache, record); } } if (descriptor) - flush_descriptor(journal, descriptor, offset); + flush_descriptor(journal, descriptor, offset, write_op); jbd_debug(1, "Wrote %d revoke records\n", count); } @@ -530,7 +531,8 @@ static void write_one_revoke_record(journal_t *journal, transaction_t *transaction, struct journal_head **descriptorp, int *offsetp, - struct jbd_revoke_record_s *record) + struct jbd_revoke_record_s *record, + int write_op) { struct journal_head *descriptor; int offset; @@ -549,7 +551,7 @@ static void write_one_revoke_record(journal_t *journal, /* Make sure we have a descriptor with space left for the record */ if (descriptor) { if (offset == journal->j_blocksize) { - flush_descriptor(journal, descriptor, offset); + flush_descriptor(journal, descriptor, offset, write_op); descriptor = NULL; } } @@ -586,7 +588,7 @@ static void write_one_revoke_record(journal_t *journal, static void flush_descriptor(journal_t *journal, struct journal_head *descriptor, - int offset) + int offset, int write_op) { journal_revoke_header_t *header; struct buffer_head *bh = jh2bh(descriptor); @@ -601,7 +603,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(bh); BUFFER_TRACE(bh, "write"); set_buffer_dirty(bh); - ll_rw_block(SWRITE, 1, &bh); + ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh); } #endif -- cgit v1.1 From 52fcd11c0900b0cbc584eeda12a6e27dd6c9d046 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 20 Apr 2009 08:58:45 +0100 Subject: GFS2: Clear dirty bit at end of inode glock sync The dirty bit can get set during the inode glock sync. Its too complicated to change that at the moment, so this is the quick fix - to clear the bit again at the end of the function. Signed-off-by: Steven Whitehouse --- fs/gfs2/glops.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index bf23a62..70f87f4 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -156,6 +156,12 @@ static void inode_go_sync(struct gfs2_glock *gl) error = filemap_fdatawait(metamapping); mapping_set_error(metamapping, error); gfs2_ail_empty_gl(gl); + /* + * Writeback of the data mapping may cause the dirty flag to be set + * so we have to clear it again here. + */ + smp_mb__before_clear_bit(); + clear_bit(GLF_DIRTY, &gl->gl_flags); } /** -- cgit v1.1 From e56985da455b9dc0591b8cb2006cc94b6f4fb0f4 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 20 Apr 2009 09:45:54 +0100 Subject: GFS2: Fix page_mkwrite() return code This allows for the possibility of returning VM_FAULT_OOM as well as VM_FAULT_SIGBUS. This ensures that the correct action is taken. Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 101caf3..5d82e91 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -413,7 +413,9 @@ out_unlock: gfs2_glock_dq(&gh); out: gfs2_holder_uninit(&gh); - if (ret) + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else if (ret) ret = VM_FAULT_SIGBUS; return ret; } -- cgit v1.1 From ffbd517d5a8c8e93ddd11046434fb029f3df73aa Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 20 Apr 2009 15:50:09 -0400 Subject: Btrfs: use WRITE_SYNC for synchronous writes Part of reducing fsync/O_SYNC/O_DIRECT latencies is using WRITE_SYNC for writes we plan on waiting on in the near future. This patch mirrors recent changes in other filesystems and the generic code to use WRITE_SYNC when WB_SYNC_ALL is passed and to use WRITE_SYNC for other latency critical writes. Btrfs uses async worker threads for checksumming before the write is done, and then again to actually submit the bios. The bio submission code just runs a per-device list of bios that need to be sent down the pipe. This list is split into low priority and high priority lists so the WRITE_SYNC IO happens first. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 4 +- fs/btrfs/extent_io.c | 44 +++++++++++------ fs/btrfs/ordered-data.c | 2 +- fs/btrfs/volumes.c | 124 +++++++++++++++++++++++++++++++++++++----------- fs/btrfs/volumes.h | 13 ++++- 5 files changed, 141 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 92caa80..fec18b4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2095,10 +2095,10 @@ static int write_dev_supers(struct btrfs_device *device, device->barriers = 0; get_bh(bh); lock_buffer(bh); - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); } } else { - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); } if (!ret && wait) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index eb2bee8..483b672 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -50,7 +50,10 @@ struct extent_page_data { /* tells writepage not to lock the state bits for this range * it still does the unlocking */ - int extent_locked; + unsigned int extent_locked:1; + + /* tells the submit_bio code to use a WRITE_SYNC */ + unsigned int sync_io:1; }; int __init extent_io_init(void) @@ -2136,8 +2139,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, u64 delalloc_end; int page_started; int compressed; + int write_flags; unsigned long nr_written = 0; + if (wbc->sync_mode == WB_SYNC_ALL) + write_flags = WRITE_SYNC_PLUG; + else + write_flags = WRITE; + WARN_ON(!PageLocked(page)); pg_offset = i_size & (PAGE_CACHE_SIZE - 1); if (page->index > end_index || @@ -2314,9 +2323,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, (unsigned long long)end); } - ret = submit_extent_page(WRITE, tree, page, sector, - iosize, pg_offset, bdev, - &epd->bio, max_nr, + ret = submit_extent_page(write_flags, tree, page, + sector, iosize, pg_offset, + bdev, &epd->bio, max_nr, end_bio_extent_writepage, 0, 0, 0); if (ret) @@ -2460,15 +2469,23 @@ retry: return ret; } -static noinline void flush_write_bio(void *data) +static void flush_epd_write_bio(struct extent_page_data *epd) { - struct extent_page_data *epd = data; if (epd->bio) { - submit_one_bio(WRITE, epd->bio, 0, 0); + if (epd->sync_io) + submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); + else + submit_one_bio(WRITE, epd->bio, 0, 0); epd->bio = NULL; } } +static noinline void flush_write_bio(void *data) +{ + struct extent_page_data *epd = data; + flush_epd_write_bio(epd); +} + int extent_write_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, struct writeback_control *wbc) @@ -2480,6 +2497,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, .tree = tree, .get_extent = get_extent, .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; struct writeback_control wbc_writepages = { .bdi = wbc->bdi, @@ -2490,13 +2508,11 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, .range_end = (loff_t)-1, }; - ret = __extent_writepage(page, wbc, &epd); extent_write_cache_pages(tree, mapping, &wbc_writepages, __extent_writepage, &epd, flush_write_bio); - if (epd.bio) - submit_one_bio(WRITE, epd.bio, 0, 0); + flush_epd_write_bio(&epd); return ret; } @@ -2515,6 +2531,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, .tree = tree, .get_extent = get_extent, .extent_locked = 1, + .sync_io = mode == WB_SYNC_ALL, }; struct writeback_control wbc_writepages = { .bdi = inode->i_mapping->backing_dev_info, @@ -2540,8 +2557,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, start += PAGE_CACHE_SIZE; } - if (epd.bio) - submit_one_bio(WRITE, epd.bio, 0, 0); + flush_epd_write_bio(&epd); return ret; } @@ -2556,13 +2572,13 @@ int extent_writepages(struct extent_io_tree *tree, .tree = tree, .get_extent = get_extent, .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; ret = extent_write_cache_pages(tree, mapping, wbc, __extent_writepage, &epd, flush_write_bio); - if (epd.bio) - submit_one_bio(WRITE, epd.bio, 0, 0); + flush_epd_write_bio(&epd); return ret; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 53c87b1..d6f0806 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -489,7 +489,7 @@ again: /* start IO across the range first to instantiate any delalloc * extents */ - btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE); + btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); /* The compression code will leave pages locked but return from * writepage without setting the page writeback. Starting again diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e0913e4..e53835b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -125,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) return NULL; } +static void requeue_list(struct btrfs_pending_bios *pending_bios, + struct bio *head, struct bio *tail) +{ + + struct bio *old_head; + + old_head = pending_bios->head; + pending_bios->head = head; + if (pending_bios->tail) + tail->bi_next = old_head; + else + pending_bios->tail = tail; +} + /* * we try to collect pending bios for a device so we don't get a large * number of procs sending bios down to the same device. This greatly @@ -141,10 +155,12 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) struct bio *pending; struct backing_dev_info *bdi; struct btrfs_fs_info *fs_info; + struct btrfs_pending_bios *pending_bios; struct bio *tail; struct bio *cur; int again = 0; - unsigned long num_run = 0; + unsigned long num_run; + unsigned long num_sync_run; unsigned long limit; unsigned long last_waited = 0; @@ -153,20 +169,30 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; + /* we want to make sure that every time we switch from the sync + * list to the normal list, we unplug + */ + num_sync_run = 0; + loop: spin_lock(&device->io_lock); + num_run = 0; loop_lock: + /* take all the bios off the list at once and process them * later on (without the lock held). But, remember the * tail and other pointers so the bios can be properly reinserted * into the list if we hit congestion */ - pending = device->pending_bios; - tail = device->pending_bio_tail; + if (device->pending_sync_bios.head) + pending_bios = &device->pending_sync_bios; + else + pending_bios = &device->pending_bios; + + pending = pending_bios->head; + tail = pending_bios->tail; WARN_ON(pending && !tail); - device->pending_bios = NULL; - device->pending_bio_tail = NULL; /* * if pending was null this time around, no bios need processing @@ -176,16 +202,41 @@ loop_lock: * device->running_pending is used to synchronize with the * schedule_bio code. */ - if (pending) { - again = 1; - device->running_pending = 1; - } else { + if (device->pending_sync_bios.head == NULL && + device->pending_bios.head == NULL) { again = 0; device->running_pending = 0; + } else { + again = 1; + device->running_pending = 1; } + + pending_bios->head = NULL; + pending_bios->tail = NULL; + spin_unlock(&device->io_lock); + /* + * if we're doing the regular priority list, make sure we unplug + * for any high prio bios we've sent down + */ + if (pending_bios == &device->pending_bios && num_sync_run > 0) { + num_sync_run = 0; + blk_run_backing_dev(bdi, NULL); + } + while (pending) { + + rmb(); + if (pending_bios != &device->pending_sync_bios && + device->pending_sync_bios.head && + num_run > 16) { + cond_resched(); + spin_lock(&device->io_lock); + requeue_list(pending_bios, pending, tail); + goto loop_lock; + } + cur = pending; pending = pending->bi_next; cur->bi_next = NULL; @@ -196,10 +247,18 @@ loop_lock: wake_up(&fs_info->async_submit_wait); BUG_ON(atomic_read(&cur->bi_cnt) == 0); - bio_get(cur); submit_bio(cur->bi_rw, cur); - bio_put(cur); num_run++; + if (bio_sync(cur)) + num_sync_run++; + + if (need_resched()) { + if (num_sync_run) { + blk_run_backing_dev(bdi, NULL); + num_sync_run = 0; + } + cond_resched(); + } /* * we made progress, there is more work to do and the bdi @@ -208,7 +267,6 @@ loop_lock: */ if (pending && bdi_write_congested(bdi) && num_run > 16 && fs_info->fs_devices->open_devices > 1) { - struct bio *old_head; struct io_context *ioc; ioc = current->io_context; @@ -233,17 +291,17 @@ loop_lock: * against it before looping */ last_waited = ioc->last_waited; + if (need_resched()) { + if (num_sync_run) { + blk_run_backing_dev(bdi, NULL); + num_sync_run = 0; + } + cond_resched(); + } continue; } spin_lock(&device->io_lock); - - old_head = device->pending_bios; - device->pending_bios = pending; - if (device->pending_bio_tail) - tail->bi_next = old_head; - else - device->pending_bio_tail = tail; - + requeue_list(pending_bios, pending, tail); device->running_pending = 1; spin_unlock(&device->io_lock); @@ -251,11 +309,18 @@ loop_lock: goto done; } } + + if (num_sync_run) { + num_sync_run = 0; + blk_run_backing_dev(bdi, NULL); + } + + cond_resched(); if (again) goto loop; spin_lock(&device->io_lock); - if (device->pending_bios) + if (device->pending_bios.head || device->pending_sync_bios.head) goto loop_lock; spin_unlock(&device->io_lock); @@ -2497,7 +2562,7 @@ again: max_errors = 1; } } - if (multi_ret && rw == WRITE && + if (multi_ret && (rw & (1 << BIO_RW)) && stripes_allocated < stripes_required) { stripes_allocated = map->num_stripes; free_extent_map(em); @@ -2762,6 +2827,7 @@ static noinline int schedule_bio(struct btrfs_root *root, int rw, struct bio *bio) { int should_queue = 1; + struct btrfs_pending_bios *pending_bios; /* don't bother with additional async steps for reads, right now */ if (!(rw & (1 << BIO_RW))) { @@ -2783,13 +2849,17 @@ static noinline int schedule_bio(struct btrfs_root *root, bio->bi_rw |= rw; spin_lock(&device->io_lock); + if (bio_sync(bio)) + pending_bios = &device->pending_sync_bios; + else + pending_bios = &device->pending_bios; - if (device->pending_bio_tail) - device->pending_bio_tail->bi_next = bio; + if (pending_bios->tail) + pending_bios->tail->bi_next = bio; - device->pending_bio_tail = bio; - if (!device->pending_bios) - device->pending_bios = bio; + pending_bios->tail = bio; + if (!pending_bios->head) + pending_bios->head = bio; if (device->running_pending) should_queue = 0; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2185de7..5836327 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -23,13 +23,22 @@ #include "async-thread.h" struct buffer_head; +struct btrfs_pending_bios { + struct bio *head; + struct bio *tail; +}; + struct btrfs_device { struct list_head dev_list; struct list_head dev_alloc_list; struct btrfs_fs_devices *fs_devices; struct btrfs_root *dev_root; - struct bio *pending_bios; - struct bio *pending_bio_tail; + + /* regular prio bios */ + struct btrfs_pending_bios pending_bios; + /* WRITE_SYNC bios */ + struct btrfs_pending_bios pending_sync_bios; + int running_pending; u64 generation; -- cgit v1.1 From d313d7a31a752c88f7288692bd98e66d0789779b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 20 Apr 2009 15:50:09 -0400 Subject: Btrfs: add a priority queue to the async thread helpers Btrfs is using WRITE_SYNC_PLUG to send down synchronous IOs with a higher priority. But, the checksumming helper threads prevent it from being fully effective. There are two problems. First, a big queue of pending checksumming will delay the synchronous IO behind other lower priority writes. Second, the checksumming uses an ordered async work queue. The ordering makes sure that IOs are sent to the block layer in the same order they are sent to the checksumming threads. Usually this gives us less seeky IO. But, when we start mixing IO priorities, the lower priority IO can delay the higher priority IO. This patch solves both problems by adding a high priority list to the async helper threads, and a new btrfs_set_work_high_prio(), which is used to make put a new async work item onto the higher priority list. The ordering is still done on high priority IO, but all of the high priority bios are ordered separately from the low priority bios. This ordering is purely an IO optimization, it is not involved in data or metadata integrity. Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 60 ++++++++++++++++++++++++++++++++++++++----------- fs/btrfs/async-thread.h | 2 ++ fs/btrfs/disk-io.c | 5 +++++ fs/btrfs/extent_io.c | 2 +- fs/btrfs/file.c | 2 +- 5 files changed, 56 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 51bfdfc..502c3d6 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -25,6 +25,7 @@ #define WORK_QUEUED_BIT 0 #define WORK_DONE_BIT 1 #define WORK_ORDER_DONE_BIT 2 +#define WORK_HIGH_PRIO_BIT 3 /* * container for the kthread task pointer and the list of pending work @@ -36,6 +37,7 @@ struct btrfs_worker_thread { /* list of struct btrfs_work that are waiting for service */ struct list_head pending; + struct list_head prio_pending; /* list of worker threads from struct btrfs_workers */ struct list_head worker_list; @@ -103,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers, spin_lock_irqsave(&workers->lock, flags); - while (!list_empty(&workers->order_list)) { - work = list_entry(workers->order_list.next, - struct btrfs_work, order_list); - + while (1) { + if (!list_empty(&workers->prio_order_list)) { + work = list_entry(workers->prio_order_list.next, + struct btrfs_work, order_list); + } else if (!list_empty(&workers->order_list)) { + work = list_entry(workers->order_list.next, + struct btrfs_work, order_list); + } else { + break; + } if (!test_bit(WORK_DONE_BIT, &work->flags)) break; @@ -143,8 +151,14 @@ static int worker_loop(void *arg) do { spin_lock_irq(&worker->lock); again_locked: - while (!list_empty(&worker->pending)) { - cur = worker->pending.next; + while (1) { + if (!list_empty(&worker->prio_pending)) + cur = worker->prio_pending.next; + else if (!list_empty(&worker->pending)) + cur = worker->pending.next; + else + break; + work = list_entry(cur, struct btrfs_work, list); list_del(&work->list); clear_bit(WORK_QUEUED_BIT, &work->flags); @@ -163,7 +177,6 @@ again_locked: spin_lock_irq(&worker->lock); check_idle_worker(worker); - } if (freezing(current)) { worker->working = 0; @@ -178,7 +191,8 @@ again_locked: * jump_in? */ smp_mb(); - if (!list_empty(&worker->pending)) + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) continue; /* @@ -191,7 +205,8 @@ again_locked: */ schedule_timeout(1); smp_mb(); - if (!list_empty(&worker->pending)) + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) continue; if (kthread_should_stop()) @@ -200,7 +215,8 @@ again_locked: /* still no more work?, sleep for real */ spin_lock_irq(&worker->lock); set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&worker->pending)) + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) goto again_locked; /* @@ -248,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) INIT_LIST_HEAD(&workers->worker_list); INIT_LIST_HEAD(&workers->idle_list); INIT_LIST_HEAD(&workers->order_list); + INIT_LIST_HEAD(&workers->prio_order_list); spin_lock_init(&workers->lock); workers->max_workers = max; workers->idle_thresh = 32; @@ -273,6 +290,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) } INIT_LIST_HEAD(&worker->pending); + INIT_LIST_HEAD(&worker->prio_pending); INIT_LIST_HEAD(&worker->worker_list); spin_lock_init(&worker->lock); atomic_set(&worker->num_pending, 0); @@ -396,7 +414,10 @@ int btrfs_requeue_work(struct btrfs_work *work) goto out; spin_lock_irqsave(&worker->lock, flags); - list_add_tail(&work->list, &worker->pending); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) + list_add_tail(&work->list, &worker->prio_pending); + else + list_add_tail(&work->list, &worker->pending); atomic_inc(&worker->num_pending); /* by definition we're busy, take ourselves off the idle @@ -422,6 +443,11 @@ out: return 0; } +void btrfs_set_work_high_prio(struct btrfs_work *work) +{ + set_bit(WORK_HIGH_PRIO_BIT, &work->flags); +} + /* * places a struct btrfs_work into the pending queue of one of the kthreads */ @@ -438,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) worker = find_worker(workers); if (workers->ordered) { spin_lock_irqsave(&workers->lock, flags); - list_add_tail(&work->order_list, &workers->order_list); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { + list_add_tail(&work->order_list, + &workers->prio_order_list); + } else { + list_add_tail(&work->order_list, &workers->order_list); + } spin_unlock_irqrestore(&workers->lock, flags); } else { INIT_LIST_HEAD(&work->order_list); @@ -446,7 +477,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) spin_lock_irqsave(&worker->lock, flags); - list_add_tail(&work->list, &worker->pending); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) + list_add_tail(&work->list, &worker->prio_pending); + else + list_add_tail(&work->list, &worker->pending); atomic_inc(&worker->num_pending); check_busy_worker(worker); diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 31be4ed..1b511c1 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -85,6 +85,7 @@ struct btrfs_workers { * of work items waiting for completion */ struct list_head order_list; + struct list_head prio_order_list; /* lock for finding the next worker thread to queue on */ spinlock_t lock; @@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); int btrfs_stop_workers(struct btrfs_workers *workers); void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); int btrfs_requeue_work(struct btrfs_work *work); +void btrfs_set_work_high_prio(struct btrfs_work *work); #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fec18b4..a6b8374 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -579,6 +579,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->bio_flags = bio_flags; atomic_inc(&fs_info->nr_async_submits); + + if (rw & (1 << BIO_RW_SYNCIO)) + btrfs_set_work_high_prio(&async->work); + btrfs_queue_worker(&fs_info->workers, &async->work); #if 0 int limit = btrfs_async_submit_limit(fs_info); @@ -656,6 +660,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 0); } + /* * kthread helpers are used to submit writes so that checksumming * can happen in parallel across all CPUs diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 483b672..5d66cb2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2501,7 +2501,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, }; struct writeback_control wbc_writepages = { .bdi = wbc->bdi, - .sync_mode = WB_SYNC_NONE, + .sync_mode = wbc->sync_mode, .older_than_this = NULL, .nr_to_write = 64, .range_start = page_offset(page) + PAGE_CACHE_SIZE, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9c9fb46..e21c006 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1131,7 +1131,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, if (will_write) { btrfs_fdatawrite_range(inode->i_mapping, pos, pos + write_bytes - 1, - WB_SYNC_NONE); + WB_SYNC_ALL); } else { balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages); -- cgit v1.1 From 11c8349b4eb68f2b04cd8ece577377e6c0e5dd4b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 20 Apr 2009 15:50:09 -0400 Subject: Btrfs: fix oops on page->mapping->host during writepage The extent_io writepage call updates the writepage index in the inode as it makes progress. But, it was doing the update after unlocking the page, which isn't legal because page->mapping can't be trusted once the page is unlocked. This lead to an oops, especially common with compression turned on. The fix here is to update the writeback index before unlocking the page. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5d66cb2..05a1c42 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2104,6 +2104,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, return ret; } +static noinline void update_nr_written(struct page *page, + struct writeback_control *wbc, + unsigned long nr_written) +{ + wbc->nr_to_write -= nr_written; + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && + wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) + page->mapping->writeback_index = page->index + nr_written; +} + /* * the writepage semantics are similar to regular writepage. extent * records are inserted to lock ranges in the tree, and as dirty areas @@ -2173,6 +2183,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, delalloc_end = 0; page_started = 0; if (!epd->extent_locked) { + /* + * make sure the wbc mapping index is at least updated + * to this page. + */ + update_nr_written(page, wbc, 0); + while (delalloc_end < page_end) { nr_delalloc = find_lock_delalloc_range(inode, tree, page, @@ -2194,7 +2210,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, */ if (page_started) { ret = 0; - goto update_nr_written; + /* + * we've unlocked the page, so we can't update + * the mapping's writeback index, just update + * nr_to_write. + */ + wbc->nr_to_write -= nr_written; + goto done_unlocked; } } lock_extent(tree, start, page_end, GFP_NOFS); @@ -2207,13 +2229,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (ret == -EAGAIN) { unlock_extent(tree, start, page_end, GFP_NOFS); redirty_page_for_writepage(wbc, page); + update_nr_written(page, wbc, nr_written); unlock_page(page); ret = 0; - goto update_nr_written; + goto done_unlocked; } } - nr_written++; + /* + * we don't want to touch the inode after unlocking the page, + * so we update the mapping writeback index now + */ + update_nr_written(page, wbc, nr_written + 1); end = page_end; if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) @@ -2345,11 +2372,8 @@ done: unlock_extent(tree, unlock_start, page_end, GFP_NOFS); unlock_page(page); -update_nr_written: - wbc->nr_to_write -= nr_written; - if (wbc->range_cyclic || (wbc->nr_to_write > 0 && - wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) - page->mapping->writeback_index = page->index + nr_written; +done_unlocked: + return 0; } -- cgit v1.1 From 8c594ea81d7abbbffdda447b127f8ba8d76f319d Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 20 Apr 2009 15:50:10 -0400 Subject: Btrfs: use the right node in reada_for_balance reada_for_balance was using the wrong index into the path node array, so it wasn't reading the right blocks. We never directly used the results of the read done by this function because the btree search is started over at the end. This fixes reada_for_balance to reada in the correct node and to avoid searching past the last slot in the node. It also makes sure to hold the parent lock while we are finding the nodes to read. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e5b2533..a99f1c2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1325,12 +1325,12 @@ static noinline int reada_for_balance(struct btrfs_root *root, int ret = 0; int blocksize; - parent = path->nodes[level - 1]; + parent = path->nodes[level + 1]; if (!parent) return 0; nritems = btrfs_header_nritems(parent); - slot = path->slots[level]; + slot = path->slots[level + 1]; blocksize = btrfs_level_size(root, level); if (slot > 0) { @@ -1341,7 +1341,7 @@ static noinline int reada_for_balance(struct btrfs_root *root, block1 = 0; free_extent_buffer(eb); } - if (slot < nritems) { + if (slot + 1 < nritems) { block2 = btrfs_node_blockptr(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1); eb = btrfs_find_tree_block(root, block2, blocksize); @@ -1351,7 +1351,11 @@ static noinline int reada_for_balance(struct btrfs_root *root, } if (block1 || block2) { ret = -EAGAIN; + + /* release the whole path */ btrfs_release_path(root, path); + + /* read the blocks */ if (block1) readahead_tree_block(root, block1, blocksize, 0); if (block2) @@ -1361,7 +1365,7 @@ static noinline int reada_for_balance(struct btrfs_root *root, eb = read_tree_block(root, block1, blocksize, 0); free_extent_buffer(eb); } - if (block1) { + if (block2) { eb = read_tree_block(root, block2, blocksize, 0); free_extent_buffer(eb); } @@ -1481,12 +1485,15 @@ read_block_for_search(struct btrfs_trans_handle *trans, * of the btree by dropping locks before * we read. */ - btrfs_release_path(NULL, p); + btrfs_unlock_up_safe(p, level + 1); + btrfs_set_path_blocking(p); + if (tmp) free_extent_buffer(tmp); if (p->reada) reada_for_search(root, p, level, slot, key->objectid); + btrfs_release_path(NULL, p); tmp = read_tree_block(root, blocknr, blocksize, gen); if (tmp) free_extent_buffer(tmp); -- cgit v1.1 From cf2706a340ae98616d4e2a54900393e0e0b6b72c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 7 Apr 2009 09:03:30 -0400 Subject: Fix AUTOFS_DEV_IOCTL_REQUESTER_CMD Missing conversion from kernel to userland dev_t; this sucker breaks as soon as we get sufficiently many autofs mounts for new_encode_dev(s_dev) != s_dev. Note: this is the minimal fix. Signed-off-by: Al Viro --- fs/autofs4/dev-ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 9e5ae8a..463f798 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -488,7 +488,7 @@ static int autofs_dev_ioctl_requester(struct file *fp, } path = param->path; - devid = sbi->sb->s_dev; + devid = new_encode_dev(sbi->sb->s_dev); param->requester.uid = param->requester.gid = -1; -- cgit v1.1 From e5d67f0715bc60f6c19abdd86d007d7bb29c9951 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 7 Apr 2009 12:15:39 -0400 Subject: Touch all affected namespaces on propagation of mount We shouldn't just touch the namespace of current process Caught-by: Trond Myklebust Signed-off-by: Al Viro --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index d9138f8..4119620 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1377,7 +1377,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, if (parent_path) { detach_mnt(source_mnt, parent_path); attach_mnt(source_mnt, path); - touch_mnt_namespace(current->nsproxy->mnt_ns); + touch_mnt_namespace(parent_path->mnt->mnt_ns); } else { mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); commit_tree(source_mnt); -- cgit v1.1 From 1644ccc8a99ae73859c39372f96afdbf03c9f80d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2009 02:32:31 -0400 Subject: Safer nfsd_cross_mnt() AFAICS, we have a subtle bug there: if we have crossed mountpoint *and* it got mount --move'd away, we'll be holding only one reference to fs containing dentry - exp->ex_path.mnt. IOW, we ought to dput() before exp_put(). Signed-off-by: Al Viro --- fs/nfsd/vfs.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ab93fcf..46e6bd2 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -116,10 +116,15 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, } if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { /* successfully crossed mount point */ - exp_put(exp); - *expp = exp2; + /* + * This is subtle: dentry is *not* under mnt at this point. + * The only reason we are safe is that original mnt is pinned + * down by exp, so we should dput before putting exp. + */ dput(dentry); *dpp = mounts; + exp_put(exp); + *expp = exp2; } else { exp_put(exp2); dput(mounts); -- cgit v1.1 From 24b6f16ecf37f918a1934d590e9e71c100d6388f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2009 03:25:41 -0400 Subject: No need for crossing to mountpoint in audit_tag_tree() is_under() will DTRT anyway. And yes, is_subdir() behaviour is intentional. Signed-off-by: Al Viro --- fs/dcache.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 761d30b..1fcffeb 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2149,7 +2149,6 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) int result; unsigned long seq; - /* FIXME: This is old behavior, needed? Please check callers. */ if (new_dentry == old_dentry) return 1; -- cgit v1.1 From 117aff744a20a2a04ccdb36cd5978316e1af0c3a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2009 11:19:26 -0400 Subject: Fix autofs_expire() mnt should remain the same for all iterations through the list; as it is, if we have a busy mount, mnt follows into it and isn't restored for the next iteration. Signed-off-by: Al Viro --- fs/autofs/dirhash.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c index bf8c8af..4eb4d8d 100644 --- a/fs/autofs/dirhash.c +++ b/fs/autofs/dirhash.c @@ -39,10 +39,12 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb, { struct autofs_dirhash *dh = &sbi->dirhash; struct autofs_dir_ent *ent; - struct dentry *dentry; unsigned long timeout = sbi->exp_timeout; while (1) { + struct path path; + int umount_ok; + if ( list_empty(&dh->expiry_head) || sbi->catatonic ) return NULL; /* No entries */ /* We keep the list sorted by last_usage and want old stuff */ @@ -57,17 +59,17 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb, return ent; /* Symlinks are always expirable */ /* Get the dentry for the autofs subdirectory */ - dentry = ent->dentry; + path.dentry = ent->dentry; - if ( !dentry ) { + if (!path.dentry) { /* Should only happen in catatonic mode */ printk("autofs: dentry == NULL but inode range is directory, entry %s\n", ent->name); autofs_delete_usage(ent); continue; } - if ( !dentry->d_inode ) { - dput(dentry); + if (!path.dentry->d_inode) { + dput(path.dentry); printk("autofs: negative dentry on expiry queue: %s\n", ent->name); autofs_delete_usage(ent); @@ -76,29 +78,29 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb, /* Make sure entry is mounted and unused; note that dentry will point to the mounted-on-top root. */ - if (!S_ISDIR(dentry->d_inode->i_mode)||!d_mountpoint(dentry)) { + if (!S_ISDIR(path.dentry->d_inode->i_mode) || + !d_mountpoint(path.dentry)) { DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); continue; } - mntget(mnt); - dget(dentry); - if (!follow_down(&mnt, &dentry)) { - dput(dentry); - mntput(mnt); + path.mnt = mnt; + path_get(&path); + if (!follow_down(&path.mnt, &path.dentry)) { + path_put(&path); DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); continue; } - while (d_mountpoint(dentry) && follow_down(&mnt, &dentry)) + while (d_mountpoint(path.dentry) && + follow_down(&path.mnt, &path.dentry)) ; - dput(dentry); + umount_ok = may_umount(path.mnt); + path_put(&path); - if ( may_umount(mnt) ) { - mntput(mnt); + if (umount_ok) { DPRINTK(("autofs: signaling expire on %s\n", ent->name)); return ent; /* Expirable! */ } DPRINTK(("autofs: didn't expire due to may_umount: %s\n", ent->name)); - mntput(mnt); } return NULL; /* No expirable entries */ } -- cgit v1.1 From 1ba0c7dbbbc24230394100c5f0d0df38cb400cff Mon Sep 17 00:00:00 2001 From: Alexander Beregalov Date: Mon, 20 Apr 2009 12:23:02 +0400 Subject: fs/compat_ioctl: fix build when !BLOCK In file included from fs/compat_ioctl.c:61: include/linux/loop.h:59: error: field 'lo_bio_list' has incomplete type Signed-off-by: Alexander Beregalov Signed-off-by: Al Viro --- fs/compat_ioctl.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 3e87ce4..b83f6bc 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -58,7 +58,6 @@ #include #include #include -#include #include #include @@ -68,6 +67,7 @@ #include #ifdef CONFIG_BLOCK +#include #include #include #include @@ -2660,6 +2660,8 @@ HANDLE_IOCTL(SONET_GETFRAMING, do_atm_ioctl) HANDLE_IOCTL(SONET_GETFRSENSE, do_atm_ioctl) /* block stuff */ #ifdef CONFIG_BLOCK +/* loop */ +IGNORE_IOCTL(LOOP_CLR_FD) /* Raw devices */ HANDLE_IOCTL(RAW_SETBIND, raw_ioctl) HANDLE_IOCTL(RAW_GETBIND, raw_ioctl) @@ -2728,9 +2730,6 @@ HANDLE_IOCTL(LPSETTIMEOUT, lp_timeout_trans) IGNORE_IOCTL(VFAT_IOCTL_READDIR_BOTH32) IGNORE_IOCTL(VFAT_IOCTL_READDIR_SHORT32) -/* loop */ -IGNORE_IOCTL(LOOP_CLR_FD) - #ifdef CONFIG_SPARC /* Sparc framebuffers, handled in sbusfb_compat_ioctl() */ IGNORE_IOCTL(FBIOGTYPE) -- cgit v1.1 From 2f9092e1020246168b1309b35e085ecd7ff9ff72 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 20 Apr 2009 23:18:37 +0100 Subject: Fix i_mutex vs. readdir handling in nfsd Commit 14f7dd63 ("Copy XFS readdir hack into nfsd code") introduced a bug to generic code which had been extant for a long time in the XFS version -- it started to call through into lookup_one_len() and hence into the file systems' ->lookup() methods without i_mutex held on the directory. This patch fixes it by locking the directory's i_mutex again before calling the filldir functions. The original deadlocks which commit 14f7dd63 was designed to avoid are still avoided, because they were due to fs-internal locking, not i_mutex. While we're at it, fix the return type of nfsd_buffered_readdir() which should be a __be32 not an int -- it's an NFS errno, not a Linux errno. And return nfserrno(-ENOMEM) when allocation fails, not just -ENOMEM. Sparse would have caught that, if it wasn't so busy bitching about __cold__. Commit 05f4f678 ("nfsd4: don't do lookup within readdir in recovery code") introduced a similar problem with calling lookup_one_len() without i_mutex, which this patch also addresses. To fix that, it was necessary to fix the called functions so that they expect i_mutex to be held; that part was done by J. Bruce Fields. Signed-off-by: David Woodhouse Umm-I-can-live-with-that-by: Al Viro Reported-by: J. R. Okajima Tested-by: J. Bruce Fields LKML-Reference: <8036.1237474444@jrobl> Cc: stable@kernel.org Signed-off-by: Al Viro --- fs/namei.c | 2 ++ fs/nfsd/nfs4recover.c | 46 +++++++++------------------------------------- fs/nfsd/vfs.c | 25 +++++++++++++++++++------ 3 files changed, 30 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index b8433eb..78f253c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1248,6 +1248,8 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) int err; struct qstr this; + WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); + err = __lookup_one_len(name, &this, base, len); if (err) return ERR_PTR(err); diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 3444c00..5275097 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -229,21 +229,23 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f) goto out; status = vfs_readdir(filp, nfsd4_build_namelist, &names); fput(filp); + mutex_lock(&dir->d_inode->i_mutex); while (!list_empty(&names)) { entry = list_entry(names.next, struct name_list, list); dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); - goto out; + break; } status = f(dir, dentry); dput(dentry); if (status) - goto out; + break; list_del(&entry->list); kfree(entry); } + mutex_unlock(&dir->d_inode->i_mutex); out: while (!list_empty(&names)) { entry = list_entry(names.next, struct name_list, list); @@ -255,36 +257,6 @@ out: } static int -nfsd4_remove_clid_file(struct dentry *dir, struct dentry *dentry) -{ - int status; - - if (!S_ISREG(dir->d_inode->i_mode)) { - printk("nfsd4: non-file found in client recovery directory\n"); - return -EINVAL; - } - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); - status = vfs_unlink(dir->d_inode, dentry); - mutex_unlock(&dir->d_inode->i_mutex); - return status; -} - -static int -nfsd4_clear_clid_dir(struct dentry *dir, struct dentry *dentry) -{ - int status; - - /* For now this directory should already be empty, but we empty it of - * any regular files anyway, just in case the directory was created by - * a kernel from the future.... */ - nfsd4_list_rec_dir(dentry, nfsd4_remove_clid_file); - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); - status = vfs_rmdir(dir->d_inode, dentry); - mutex_unlock(&dir->d_inode->i_mutex); - return status; -} - -static int nfsd4_unlink_clid_dir(char *name, int namlen) { struct dentry *dentry; @@ -294,18 +266,18 @@ nfsd4_unlink_clid_dir(char *name, int namlen) mutex_lock(&rec_dir.dentry->d_inode->i_mutex); dentry = lookup_one_len(name, rec_dir.dentry, namlen); - mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); - return status; + goto out_unlock; } status = -ENOENT; if (!dentry->d_inode) goto out; - - status = nfsd4_clear_clid_dir(rec_dir.dentry, dentry); + status = vfs_rmdir(rec_dir.dentry->d_inode, dentry); out: dput(dentry); +out_unlock: + mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); return status; } @@ -348,7 +320,7 @@ purge_old(struct dentry *parent, struct dentry *child) if (nfs4_has_reclaimed_state(child->d_name.name, false)) return 0; - status = nfsd4_clear_clid_dir(parent, child); + status = vfs_rmdir(parent->d_inode, child); if (status) printk("failed to remove client recovery directory %s\n", child->d_name.name); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 46e6bd2..6c68ffd 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1890,8 +1890,8 @@ static int nfsd_buffered_filldir(void *__buf, const char *name, int namlen, return 0; } -static int nfsd_buffered_readdir(struct file *file, filldir_t func, - struct readdir_cd *cdp, loff_t *offsetp) +static __be32 nfsd_buffered_readdir(struct file *file, filldir_t func, + struct readdir_cd *cdp, loff_t *offsetp) { struct readdir_data buf; struct buffered_dirent *de; @@ -1901,11 +1901,12 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func, buf.dirent = (void *)__get_free_page(GFP_KERNEL); if (!buf.dirent) - return -ENOMEM; + return nfserrno(-ENOMEM); offset = *offsetp; while (1) { + struct inode *dir_inode = file->f_path.dentry->d_inode; unsigned int reclen; cdp->err = nfserr_eof; /* will be cleared on successful read */ @@ -1924,26 +1925,38 @@ static int nfsd_buffered_readdir(struct file *file, filldir_t func, if (!size) break; + /* + * Various filldir functions may end up calling back into + * lookup_one_len() and the file system's ->lookup() method. + * These expect i_mutex to be held, as it would within readdir. + */ + host_err = mutex_lock_killable(&dir_inode->i_mutex); + if (host_err) + break; + de = (struct buffered_dirent *)buf.dirent; while (size > 0) { offset = de->offset; if (func(cdp, de->name, de->namlen, de->offset, de->ino, de->d_type)) - goto done; + break; if (cdp->err != nfs_ok) - goto done; + break; reclen = ALIGN(sizeof(*de) + de->namlen, sizeof(u64)); size -= reclen; de = (struct buffered_dirent *)((char *)de + reclen); } + mutex_unlock(&dir_inode->i_mutex); + if (size > 0) /* We bailed out early */ + break; + offset = vfs_llseek(file, 0, SEEK_CUR); } - done: free_page((unsigned long)(buf.dirent)); if (host_err) -- cgit v1.1 From 3eac8778a237d83a1e553eba0c6f4fd4b39eeec0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 7 Apr 2009 11:12:46 -0400 Subject: autofs4: use memchr() in invalid_string() Signed-off-by: Al Viro --- fs/autofs4/dev-ioctl.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 463f798..84168c0 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -54,11 +54,10 @@ static int check_name(const char *name) * Check a string doesn't overrun the chunk of * memory we copied from user land. */ -static int invalid_str(char *str, void *end) +static int invalid_str(char *str, size_t size) { - while ((void *) str <= end) - if (!*str++) - return 0; + if (memchr(str, 0, size)) + return 0; return -EINVAL; } @@ -138,8 +137,7 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) } if (param->size > sizeof(*param)) { - err = invalid_str(param->path, - (void *) ((size_t) param + param->size)); + err = invalid_str(param->path, param->size - sizeof(*param)); if (err) { AUTOFS_WARN( "path string terminator missing for cmd(0x%08x)", -- cgit v1.1 From 3939fcde24473dc09ce16e922c88df9b3bee45d9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 8 Apr 2009 15:06:12 +0800 Subject: xattr: use memdup_user() Remove open-coded memdup_user() Signed-off-by: Li Zefan Signed-off-by: Al Viro --- fs/xattr.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xattr.c b/fs/xattr.c index 197c4fc..d51b8f9 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -237,13 +237,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, if (size) { if (size > XATTR_SIZE_MAX) return -E2BIG; - kvalue = kmalloc(size, GFP_KERNEL); - if (!kvalue) - return -ENOMEM; - if (copy_from_user(kvalue, value, size)) { - kfree(kvalue); - return -EFAULT; - } + kvalue = memdup_user(value, size); + if (IS_ERR(kvalue)) + return PTR_ERR(kvalue); } error = vfs_setxattr(d, kname, kvalue, size, flags); -- cgit v1.1 From dae7b665cf6d6e6e733f1c9c16cf55547dd37e33 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 8 Apr 2009 15:06:54 +0800 Subject: btrfs: use memdup_user() Remove open-coded memdup_user(). Note this changes some GFP_NOFS to GFP_KERNEL, since copy_from_user() may cause pagefault, it's pointless to pass GFP_NOFS to kmalloc(). Signed-off-by: Li Zefan Signed-off-by: Al Viro --- fs/btrfs/ioctl.c | 49 ++++++++++++------------------------------------- fs/btrfs/super.c | 13 ++++--------- 2 files changed, 16 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7594bec..9f135e8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -461,15 +461,9 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); - - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; namelen = strlen(vol_args->name); @@ -545,7 +539,6 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) out_unlock: mutex_unlock(&root->fs_info->volume_mutex); -out: kfree(vol_args); return ret; } @@ -565,15 +558,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); - - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; namelen = strlen(vol_args->name); @@ -675,19 +662,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_init_new_device(root, vol_args->name); -out: kfree(vol_args); return ret; } @@ -703,19 +684,13 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_rm_device(root, vol_args->name); -out: kfree(vol_args); return ret; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9744af9..a7acfe6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -635,14 +635,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - vol = kmalloc(sizeof(*vol), GFP_KERNEL); - if (!vol) - return -ENOMEM; - - if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { - ret = -EFAULT; - goto out; - } + vol = memdup_user((void __user *)arg, sizeof(*vol)); + if (IS_ERR(vol)) + return PTR_ERR(vol); switch (cmd) { case BTRFS_IOC_SCAN_DEV: @@ -650,7 +645,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, &btrfs_fs_type, &fs_devices); break; } -out: + kfree(vol); return ret; } -- cgit v1.1 From 1c8542c7bb239ef02fe21477acd9cdac04c1b640 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 8 Apr 2009 15:07:30 +0800 Subject: sysfs: use memdup_user() Remove open-coded memdup_user(). Signed-off-by: Li Zefan Signed-off-by: Al Viro --- fs/sysfs/bin.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 93e0c02..9345806 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -157,14 +157,9 @@ static ssize_t write(struct file *file, const char __user *userbuf, count = size - offs; } - temp = kmalloc(count, GFP_KERNEL); - if (!temp) - return -ENOMEM; - - if (copy_from_user(temp, userbuf, count)) { - count = -EFAULT; - goto out_free; - } + temp = memdup_user(userbuf, count); + if (IS_ERR(temp)) + return PTR_ERR(temp); mutex_lock(&bb->mutex); @@ -176,8 +171,6 @@ static ssize_t write(struct file *file, const char __user *userbuf, if (count > 0) *off = offs + count; -out_free: - kfree(temp); return count; } -- cgit v1.1 From 0e639bdeef26faf287db77a15530f3f295a4ae04 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 8 Apr 2009 15:08:04 +0800 Subject: xfs: use memdup_user() Remove open-coded memdup_user() Signed-off-by: Li Zefan Signed-off-by: Al Viro --- fs/xfs/linux-2.6/xfs_ioctl.c | 23 +++++++---------------- fs/xfs/linux-2.6/xfs_ioctl32.c | 12 ++++-------- 2 files changed, 11 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index d0b4994..34eaab6 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -489,17 +489,12 @@ xfs_attrmulti_attr_set( if (len > XATTR_SIZE_MAX) return EINVAL; - kbuf = kmalloc(len, GFP_KERNEL); - if (!kbuf) - return ENOMEM; - - if (copy_from_user(kbuf, ubuf, len)) - goto out_kfree; + kbuf = memdup_user(ubuf, len); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); - out_kfree: - kfree(kbuf); return error; } @@ -540,20 +535,16 @@ xfs_attrmulti_by_handle( if (!size || size > 16 * PAGE_SIZE) goto out_dput; - error = ENOMEM; - ops = kmalloc(size, GFP_KERNEL); - if (!ops) + ops = memdup_user(am_hreq.ops, size); + if (IS_ERR(ops)) { + error = PTR_ERR(ops); goto out_dput; - - error = EFAULT; - if (copy_from_user(ops, am_hreq.ops, size)) - goto out_kfree_ops; + } attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); if (!attr_name) goto out_kfree_ops; - error = 0; for (i = 0; i < am_hreq.opcount; i++) { ops[i].am_error = strncpy_from_user(attr_name, diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index c70c4e3..0882d16 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -427,20 +427,16 @@ xfs_compat_attrmulti_by_handle( if (!size || size > 16 * PAGE_SIZE) goto out_dput; - error = ENOMEM; - ops = kmalloc(size, GFP_KERNEL); - if (!ops) + ops = memdup_user(compat_ptr(am_hreq.ops), size); + if (IS_ERR(ops)) { + error = PTR_ERR(ops); goto out_dput; - - error = EFAULT; - if (copy_from_user(ops, compat_ptr(am_hreq.ops), size)) - goto out_kfree_ops; + } attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); if (!attr_name) goto out_kfree_ops; - error = 0; for (i = 0; i < am_hreq.opcount; i++) { ops[i].am_error = strncpy_from_user(attr_name, -- cgit v1.1 From a9482ebcdedbc5872ed34a266e6a45c35116f264 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 8 Apr 2009 15:08:53 +0800 Subject: ncpfs: use memdup_user() Remove open-coded memdup_user() Signed-off-by: Li Zefan Signed-off-by: Al Viro --- fs/ncpfs/ioctl.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index f54360f..fa038df 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -660,13 +660,10 @@ outrel: if (user.object_name_len > NCP_OBJECT_NAME_MAX_LEN) return -ENOMEM; if (user.object_name_len) { - newname = kmalloc(user.object_name_len, GFP_USER); - if (!newname) - return -ENOMEM; - if (copy_from_user(newname, user.object_name, user.object_name_len)) { - kfree(newname); - return -EFAULT; - } + newname = memdup_user(user.object_name, + user.object_name_len); + if (IS_ERR(newname)) + return PTR_ERR(newname); } else { newname = NULL; } @@ -760,13 +757,9 @@ outrel: if (user.len > NCP_PRIVATE_DATA_MAX_LEN) return -ENOMEM; if (user.len) { - new = kmalloc(user.len, GFP_USER); - if (!new) - return -ENOMEM; - if (copy_from_user(new, user.data, user.len)) { - kfree(new); - return -EFAULT; - } + new = memdup_user(user.data, user.len); + if (IS_ERR(new)) + return PTR_ERR(new); } else { new = NULL; } -- cgit v1.1 From fd56d242b3b80b6f2ca174272b20029aae61df75 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 8 Apr 2009 15:09:29 +0800 Subject: ecryptfs: use memdup_user() Remove open-coded memdup_user(). Signed-off-by: Li Zefan Signed-off-by: Al Viro --- fs/ecryptfs/miscdev.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index a67fea6..dda3c58 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -418,18 +418,13 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, if (count == 0) goto out; - data = kmalloc(count, GFP_KERNEL); - if (!data) { - printk(KERN_ERR "%s: Out of memory whilst attempting to " - "kmalloc([%zd], GFP_KERNEL)\n", __func__, count); + + data = memdup_user(buf, count); + if (IS_ERR(data)) { + printk(KERN_ERR "%s: memdup_user returned error [%ld]\n", + __func__, PTR_ERR(data)); goto out; } - rc = copy_from_user(data, buf, count); - if (rc) { - printk(KERN_ERR "%s: copy_from_user returned error [%d]\n", - __func__, rc); - goto out_free; - } sz = count; i = 0; switch (data[i++]) { -- cgit v1.1 From 0112fc2229847feb6c4eb011e6833d8f1742a375 Mon Sep 17 00:00:00 2001 From: Oleg Drokin Date: Wed, 8 Apr 2009 20:05:42 +0400 Subject: Separate out common fstatat code into vfs_fstatat This is a version incorporating Christoph's suggestion. Separate out common *fstatat functionality into a single function instead of duplicating it all over the code. Signed-off-by: Oleg Drokin Signed-off-by: Al Viro --- fs/compat.c | 19 +++++-------------- fs/stat.c | 56 ++++++++++++++++++++++++++++---------------------------- 2 files changed, 33 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index 3f84d5f..dda72e2 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -204,21 +204,12 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename, struct compat_stat __user *statbuf, int flag) { struct kstat stat; - int error = -EINVAL; - - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) - goto out; - - if (flag & AT_SYMLINK_NOFOLLOW) - error = vfs_lstat_fd(dfd, filename, &stat); - else - error = vfs_stat_fd(dfd, filename, &stat); - - if (!error) - error = cp_compat_stat(&stat, statbuf); + int error; -out: - return error; + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); } #endif diff --git a/fs/stat.c b/fs/stat.c index 2db740a..5471166 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -109,6 +109,24 @@ int vfs_fstat(unsigned int fd, struct kstat *stat) EXPORT_SYMBOL(vfs_fstat); +int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag) +{ + int error = -EINVAL; + + if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) + goto out; + + if (flag & AT_SYMLINK_NOFOLLOW) + error = vfs_lstat_fd(dfd, filename, stat); + else + error = vfs_stat_fd(dfd, filename, stat); +out: + return error; +} + +EXPORT_SYMBOL(vfs_fstatat); + + #ifdef __ARCH_WANT_OLD_STAT /* @@ -264,21 +282,12 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, char __user *, filename, struct stat __user *, statbuf, int, flag) { struct kstat stat; - int error = -EINVAL; - - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) - goto out; - - if (flag & AT_SYMLINK_NOFOLLOW) - error = vfs_lstat_fd(dfd, filename, &stat); - else - error = vfs_stat_fd(dfd, filename, &stat); - - if (!error) - error = cp_new_stat(&stat, statbuf); + int error; -out: - return error; + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_new_stat(&stat, statbuf); } #endif @@ -404,21 +413,12 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename, struct stat64 __user *, statbuf, int, flag) { struct kstat stat; - int error = -EINVAL; - - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) - goto out; - - if (flag & AT_SYMLINK_NOFOLLOW) - error = vfs_lstat_fd(dfd, filename, &stat); - else - error = vfs_stat_fd(dfd, filename, &stat); - - if (!error) - error = cp_new_stat64(&stat, statbuf); + int error; -out: - return error; + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_new_stat64(&stat, statbuf); } #endif /* __ARCH_WANT_STAT64 */ -- cgit v1.1 From 2eae7a1874ca5be3232765d89e0250a449f1bc90 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Apr 2009 16:34:03 -0400 Subject: kill vfs_stat_fd / vfs_lstat_fd There's really no reason to keep vfs_stat_fd and vfs_lstat_fd with Oleg's vfs_fstatat. Use vfs_fstatat for the few cases having the directory fd, and switch all others to vfs_stat / vfs_lstat. Reviewed-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/compat.c | 18 ++++++----- fs/stat.c | 105 ++++++++++++++++++++++++------------------------------------ 2 files changed, 52 insertions(+), 71 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index dda72e2..379a399 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -181,22 +181,24 @@ asmlinkage long compat_sys_newstat(char __user * filename, struct compat_stat __user *statbuf) { struct kstat stat; - int error = vfs_stat_fd(AT_FDCWD, filename, &stat); + int error; - if (!error) - error = cp_compat_stat(&stat, statbuf); - return error; + error = vfs_stat(filename, &stat); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); } asmlinkage long compat_sys_newlstat(char __user * filename, struct compat_stat __user *statbuf) { struct kstat stat; - int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); + int error; - if (!error) - error = cp_compat_stat(&stat, statbuf); - return error; + error = vfs_lstat(filename, &stat); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); } #ifndef __ARCH_WANT_STAT64 diff --git a/fs/stat.c b/fs/stat.c index 5471166..075694e 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -55,46 +55,6 @@ int vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) EXPORT_SYMBOL(vfs_getattr); -int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat) -{ - struct path path; - int error; - - error = user_path_at(dfd, name, LOOKUP_FOLLOW, &path); - if (!error) { - error = vfs_getattr(path.mnt, path.dentry, stat); - path_put(&path); - } - return error; -} - -int vfs_stat(char __user *name, struct kstat *stat) -{ - return vfs_stat_fd(AT_FDCWD, name, stat); -} - -EXPORT_SYMBOL(vfs_stat); - -int vfs_lstat_fd(int dfd, char __user *name, struct kstat *stat) -{ - struct path path; - int error; - - error = user_path_at(dfd, name, 0, &path); - if (!error) { - error = vfs_getattr(path.mnt, path.dentry, stat); - path_put(&path); - } - return error; -} - -int vfs_lstat(char __user *name, struct kstat *stat) -{ - return vfs_lstat_fd(AT_FDCWD, name, stat); -} - -EXPORT_SYMBOL(vfs_lstat); - int vfs_fstat(unsigned int fd, struct kstat *stat) { struct file *f = fget(fd); @@ -106,26 +66,43 @@ int vfs_fstat(unsigned int fd, struct kstat *stat) } return error; } - EXPORT_SYMBOL(vfs_fstat); int vfs_fstatat(int dfd, char __user *filename, struct kstat *stat, int flag) { + struct path path; int error = -EINVAL; + int lookup_flags = 0; if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) goto out; - if (flag & AT_SYMLINK_NOFOLLOW) - error = vfs_lstat_fd(dfd, filename, stat); - else - error = vfs_stat_fd(dfd, filename, stat); + if (!(flag & AT_SYMLINK_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + + error = user_path_at(dfd, filename, lookup_flags, &path); + if (error) + goto out; + + error = vfs_getattr(path.mnt, path.dentry, stat); + path_put(&path); out: return error; } - EXPORT_SYMBOL(vfs_fstatat); +int vfs_stat(char __user *name, struct kstat *stat) +{ + return vfs_fstatat(AT_FDCWD, name, stat, 0); +} +EXPORT_SYMBOL(vfs_stat); + +int vfs_lstat(char __user *name, struct kstat *stat) +{ + return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW); +} +EXPORT_SYMBOL(vfs_lstat); + #ifdef __ARCH_WANT_OLD_STAT @@ -173,23 +150,25 @@ static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * sta SYSCALL_DEFINE2(stat, char __user *, filename, struct __old_kernel_stat __user *, statbuf) { struct kstat stat; - int error = vfs_stat_fd(AT_FDCWD, filename, &stat); + int error; - if (!error) - error = cp_old_stat(&stat, statbuf); + error = vfs_stat(filename, &stat); + if (error) + return error; - return error; + return cp_old_stat(&stat, statbuf); } SYSCALL_DEFINE2(lstat, char __user *, filename, struct __old_kernel_stat __user *, statbuf) { struct kstat stat; - int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); + int error; - if (!error) - error = cp_old_stat(&stat, statbuf); + error = vfs_lstat(filename, &stat); + if (error) + return error; - return error; + return cp_old_stat(&stat, statbuf); } SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf) @@ -258,23 +237,23 @@ static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf) SYSCALL_DEFINE2(newstat, char __user *, filename, struct stat __user *, statbuf) { struct kstat stat; - int error = vfs_stat_fd(AT_FDCWD, filename, &stat); - - if (!error) - error = cp_new_stat(&stat, statbuf); + int error = vfs_stat(filename, &stat); - return error; + if (error) + return error; + return cp_new_stat(&stat, statbuf); } SYSCALL_DEFINE2(newlstat, char __user *, filename, struct stat __user *, statbuf) { struct kstat stat; - int error = vfs_lstat_fd(AT_FDCWD, filename, &stat); + int error; - if (!error) - error = cp_new_stat(&stat, statbuf); + error = vfs_lstat(filename, &stat); + if (error) + return error; - return error; + return cp_new_stat(&stat, statbuf); } #if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) -- cgit v1.1 From 38e23c95f92a84fb8505a9f572b8a209c9c372c1 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 9 Apr 2009 20:17:52 +0900 Subject: fs: Mark get_filesystem_list() as __init function. "int get_filesystem_list(char * buf)" is called by only "static void __init get_fs_names(char *page)". We can mark get_filesystem_list() as "__init". Signed-off-by: Tetsuo Handa Signed-off-by: Al Viro --- fs/filesystems.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/filesystems.c b/fs/filesystems.c index 1aa7026..a24c58e 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -199,7 +199,7 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) return retval; } -int get_filesystem_list(char * buf) +int __init get_filesystem_list(char *buf) { int len = 0; struct file_system_type * tmp; -- cgit v1.1 From 8340437210390676f687633a80e3748c40885dc8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 20 Apr 2009 14:58:35 -0400 Subject: NFS: Fix the XDR iovec calculation in nfs3_xdr_setaclargs Commit ae46141ff08f1965b17c531b571953c39ce8b9e2 (NFSv3: Fix posix ACL code) introduces a bug in the calculation of the XDR header iovec. In the case where we are inlining the acls, we need to adjust the length of the iovec req->rq_svec, in addition to adjusting the total buffer length. Tested-by: Leonardo Chiquitto Tested-by: Suresh Jayaraman Signed-off-by: Trond Myklebust Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/nfs/nfs3xdr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index e6a1932..35869a4 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -713,7 +713,8 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p, if (args->npages != 0) xdr_encode_pages(buf, args->pages, 0, args->len); else - req->rq_slen += args->len; + req->rq_slen = xdr_adjust_iovec(req->rq_svec, + p + XDR_QUADLEN(args->len)); err = nfsacl_encode(buf, base, args->inode, (args->mask & NFS_ACL) ? -- cgit v1.1 From 546888da82082555a56528730a83f0afd12f33bf Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 21 Apr 2009 11:53:38 -0400 Subject: Btrfs: fix btrfs fallocate oops and deadlock Btrfs fallocate was incorrectly starting a transaction with a lock held on the extent_io tree for the file, which could deadlock. Strictly speaking it was using join_transaction which would be safe, but it is better to move the transaction outside of the lock. When preallocated extents are overwritten, btrfs_mark_buffer_dirty was being called on an unlocked buffer. This was triggering an assertion and oops because the lock is supposed to be held. The bug was calling btrfs_mark_buffer_dirty on a leaf after btrfs_del_item had been run. btrfs_del_item takes care of dirtying things, so the solution is a to skip the btrfs_mark_buffer_dirty call in this case. Signed-off-by: Chris Mason --- fs/btrfs/file.c | 4 +++- fs/btrfs/inode.c | 36 ++++++++++++++++++++++++++++-------- 2 files changed, 31 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e21c006..482f8db 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -830,7 +830,7 @@ again: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); BUG_ON(ret); - goto done; + goto release; } else if (split == start) { if (locked_end < extent_end) { ret = try_lock_extent(&BTRFS_I(inode)->io_tree, @@ -926,6 +926,8 @@ again: } done: btrfs_mark_buffer_dirty(leaf); + +release: btrfs_release_path(root, path); if (split_end && split == start) { split = end; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a0d1dd4..65219f6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4970,10 +4970,10 @@ out_fail: return err; } -static int prealloc_file_range(struct inode *inode, u64 start, u64 end, +static int prealloc_file_range(struct btrfs_trans_handle *trans, + struct inode *inode, u64 start, u64 end, u64 alloc_hint, int mode) { - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 alloc_size; @@ -4981,10 +4981,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, u64 num_bytes = end - start; int ret = 0; - trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); - btrfs_set_trans_block_group(trans, inode); - while (num_bytes > 0) { alloc_size = min(num_bytes, root->fs_info->max_extent); ret = btrfs_reserve_extent(trans, root, alloc_size, @@ -5015,7 +5011,6 @@ out: BUG_ON(ret); } - btrfs_end_transaction(trans, root); return ret; } @@ -5029,11 +5024,18 @@ static long btrfs_fallocate(struct inode *inode, int mode, u64 alloc_hint = 0; u64 mask = BTRFS_I(inode)->root->sectorsize - 1; struct extent_map *em; + struct btrfs_trans_handle *trans; int ret; alloc_start = offset & ~mask; alloc_end = (offset + len + mask) & ~mask; + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. + */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + mutex_lock(&inode->i_mutex); if (alloc_start > inode->i_size) { ret = btrfs_cont_expand(inode, alloc_start); @@ -5043,6 +5045,16 @@ static long btrfs_fallocate(struct inode *inode, int mode, while (1) { struct btrfs_ordered_extent *ordered; + + trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1); + if (!trans) { + ret = -EIO; + goto out; + } + + /* the extent lock is ordered inside the running + * transaction + */ lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, GFP_NOFS); ordered = btrfs_lookup_first_ordered_extent(inode, @@ -5053,6 +5065,12 @@ static long btrfs_fallocate(struct inode *inode, int mode, btrfs_put_ordered_extent(ordered); unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, GFP_NOFS); + btrfs_end_transaction(trans, BTRFS_I(inode)->root); + + /* + * we can't wait on the range with the transaction + * running or with the extent lock held + */ btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); } else { @@ -5070,7 +5088,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, last_byte = min(extent_map_end(em), alloc_end); last_byte = (last_byte + mask) & ~mask; if (em->block_start == EXTENT_MAP_HOLE) { - ret = prealloc_file_range(inode, cur_offset, + ret = prealloc_file_range(trans, inode, cur_offset, last_byte, alloc_hint, mode); if (ret < 0) { free_extent_map(em); @@ -5089,6 +5107,8 @@ static long btrfs_fallocate(struct inode *inode, int mode, } unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, GFP_NOFS); + + btrfs_end_transaction(trans, BTRFS_I(inode)->root); out: mutex_unlock(&inode->i_mutex); return ret; -- cgit v1.1 From c12ddba09394c60e1120e6997794fa6ed52da884 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 21 Apr 2009 12:24:05 -0700 Subject: hugetlbfs: return negative error code for bad mount option This fixes the following BUG: # mount -o size=MM -t hugetlbfs none /huge hugetlbfs: Bad value 'MM' for mount option 'size=MM' ------------[ cut here ]------------ kernel BUG at fs/super.c:996! Due to BUG_ON(!mnt->mnt_sb); in vfs_kern_mount(). Also, remove unused #include Cc: William Irwin Cc: Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 23a3c76..153d968 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -842,7 +841,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) bad_val: printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n", args[0].from, p); - return 1; + return -EINVAL; } static int -- cgit v1.1 From 451a9ebf653d28337ba53ed5b4b70b0b9543cca1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 15 Apr 2009 19:50:51 +0200 Subject: bio: fix bio_kmalloc() Impact: fix bio_kmalloc() and its destruction path bio_kmalloc() was broken in two ways. * bvec_alloc_bs() first allocates bvec using kmalloc() and then ignores it and allocates again like non-kmalloc bvecs. * bio_kmalloc_destructor() didn't check for and free bio integrity data. This patch fixes the above problems. kmalloc patch is separated out from bio_alloc_bioset() and allocates the requested number of bvecs as inline bvecs. * bio_alloc_bioset() no longer takes NULL @bs. None other than bio_kmalloc() used it and outside users can't know how it was allocated anyway. * Define and use BIO_POOL_NONE so that pool index check in bvec_free_bs() triggers if inline or kmalloc allocated bvec gets there. * Relocate destructors on top of each allocation function so that how they're used is more clear. Jens Axboe suggested allocating bvecs inline. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/bio.c | 118 +++++++++++++++++++++++++++++---------------------------------- 1 file changed, 54 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index cd42bb8..d35588f 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -175,14 +175,6 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_vec *bvl; /* - * If 'bs' is given, lookup the pool and do the mempool alloc. - * If not, this is a bio_kmalloc() allocation and just do a - * kzalloc() for the exact number of vecs right away. - */ - if (!bs) - bvl = kmalloc(nr * sizeof(struct bio_vec), gfp_mask); - - /* * see comment near bvec_array define! */ switch (nr) { @@ -260,21 +252,6 @@ void bio_free(struct bio *bio, struct bio_set *bs) mempool_free(p, bs->bio_pool); } -/* - * default destructor for a bio allocated with bio_alloc_bioset() - */ -static void bio_fs_destructor(struct bio *bio) -{ - bio_free(bio, fs_bio_set); -} - -static void bio_kmalloc_destructor(struct bio *bio) -{ - if (bio_has_allocated_vec(bio)) - kfree(bio->bi_io_vec); - kfree(bio); -} - void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); @@ -301,21 +278,15 @@ void bio_init(struct bio *bio) **/ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { + unsigned long idx = BIO_POOL_NONE; struct bio_vec *bvl = NULL; - struct bio *bio = NULL; - unsigned long idx = 0; - void *p = NULL; - - if (bs) { - p = mempool_alloc(bs->bio_pool, gfp_mask); - if (!p) - goto err; - bio = p + bs->front_pad; - } else { - bio = kmalloc(sizeof(*bio), gfp_mask); - if (!bio) - goto err; - } + struct bio *bio; + void *p; + + p = mempool_alloc(bs->bio_pool, gfp_mask); + if (unlikely(!p)) + return NULL; + bio = p + bs->front_pad; bio_init(bio); @@ -332,22 +303,50 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) nr_iovecs = bvec_nr_vecs(idx); } +out_set: bio->bi_flags |= idx << BIO_POOL_OFFSET; bio->bi_max_vecs = nr_iovecs; -out_set: bio->bi_io_vec = bvl; - return bio; err_free: - if (bs) - mempool_free(p, bs->bio_pool); - else - kfree(bio); -err: + mempool_free(p, bs->bio_pool); return NULL; } +static void bio_fs_destructor(struct bio *bio) +{ + bio_free(bio, fs_bio_set); +} + +/** + * bio_alloc - allocate a new bio, memory pool backed + * @gfp_mask: allocation mask to use + * @nr_iovecs: number of iovecs + * + * Allocate a new bio with @nr_iovecs bvecs. If @gfp_mask + * contains __GFP_WAIT, the allocation is guaranteed to succeed. + * + * RETURNS: + * Pointer to new bio on success, NULL on failure. + */ +struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) +{ + struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); + + if (bio) + bio->bi_destructor = bio_fs_destructor; + + return bio; +} + +static void bio_kmalloc_destructor(struct bio *bio) +{ + if (bio_integrity(bio)) + bio_integrity_free(bio); + kfree(bio); +} + /** * bio_alloc - allocate a bio for I/O * @gfp_mask: the GFP_ mask given to the slab allocator @@ -366,29 +365,20 @@ err: * do so can cause livelocks under memory pressure. * **/ -struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) -{ - struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); - - if (bio) - bio->bi_destructor = bio_fs_destructor; - - return bio; -} - -/* - * Like bio_alloc(), but doesn't use a mempool backing. This means that - * it CAN fail, but while bio_alloc() can only be used for allocations - * that have a short (finite) life span, bio_kmalloc() should be used - * for more permanent bio allocations (like allocating some bio's for - * initalization or setup purposes). - */ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) { - struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); + struct bio *bio; - if (bio) - bio->bi_destructor = bio_kmalloc_destructor; + bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), + gfp_mask); + if (unlikely(!bio)) + return NULL; + + bio_init(bio); + bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; + bio->bi_max_vecs = nr_iovecs; + bio->bi_io_vec = bio->bi_inline_vecs; + bio->bi_destructor = bio_kmalloc_destructor; return bio; } -- cgit v1.1 From a9e9dc24bbc3e084450a22cf4fb82f5f5d4cbeea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 15 Apr 2009 22:10:27 +0900 Subject: bio: use bio_kmalloc() in copy/map functions Impact: remove possible deadlock condition There is no reason to use mempool backed allocation for map functions. Also, because kern mapping is used inside LLDs (e.g. for EH), using mempool backed allocation can lead to deadlock under extreme conditions (mempool already consumed by the time a request reached EH and requests are blocked on EH). Switch copy/map functions to bio_kmalloc(). Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/bio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index d35588f..7bbc98f 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -822,7 +822,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, return ERR_PTR(-ENOMEM); ret = -ENOMEM; - bio = bio_alloc(gfp_mask, nr_pages); + bio = bio_kmalloc(gfp_mask, nr_pages); if (!bio) goto out_bmd; @@ -946,7 +946,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, if (!nr_pages) return ERR_PTR(-EINVAL); - bio = bio_alloc(gfp_mask, nr_pages); + bio = bio_kmalloc(gfp_mask, nr_pages); if (!bio) return ERR_PTR(-ENOMEM); @@ -1130,7 +1130,7 @@ static struct bio *__bio_map_kern(struct request_queue *q, void *data, int offset, i; struct bio *bio; - bio = bio_alloc(gfp_mask, nr_pages); + bio = bio_kmalloc(gfp_mask, nr_pages); if (!bio) return ERR_PTR(-ENOMEM); -- cgit v1.1 From ae6e84596e7b321d9a08e81679c6a3f799634636 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Thu, 12 Mar 2009 00:19:46 -0500 Subject: eCryptfs: Copy lower inode attrs before dentry instantiation Copies the lower inode attributes to the upper inode before passing the upper inode to d_instantiate(). This is important for security_d_instantiate(). The problem was discovered by a user seeing SELinux denials like so: type=AVC msg=audit(1236812817.898:47): avc: denied { 0x100000 } for pid=3584 comm="httpd" name="testdir" dev=ecryptfs ino=943872 scontext=root:system_r:httpd_t:s0 tcontext=root:object_r:httpd_sys_content_t:s0 tclass=file Notice target class is file while testdir is really a directory, confusing the permission translation (0x100000) due to the wrong i_mode. Signed-off-by: Tyler Hicks --- fs/ecryptfs/main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index aed56c2..7638b0a 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -190,14 +190,14 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, init_special_inode(inode, lower_inode->i_mode, lower_inode->i_rdev); dentry->d_op = &ecryptfs_dops; - if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD) - d_add(dentry, inode); - else - d_instantiate(dentry, inode); fsstack_copy_attr_all(inode, lower_inode, NULL); /* This size will be overwritten for real files w/ headers and * other metadata */ fsstack_copy_inode_size(inode, lower_inode); + if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD) + d_add(dentry, inode); + else + d_instantiate(dentry, inode); out: return rc; } -- cgit v1.1 From 57ea34d19963781d05eb12f9b31bd4f70d61ec16 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Sun, 15 Mar 2009 14:17:01 -0500 Subject: eCryptfs: NULL pointer dereference in ecryptfs_send_miscdev() If data is NULL, msg_ctx->msg is set to NULL and then dereferenced afterwards. ecryptfs_send_raw_message() is the only place that ecryptfs_send_miscdev() is called with data being NULL, but the only caller of that function (ecryptfs_process_helo()) is never called. In short, there is currently no way to trigger the NULL pointer dereference. This patch removes the two unused functions and modifies ecryptfs_send_miscdev() to remove the NULL dereferences. Signed-off-by: Tyler Hicks --- fs/ecryptfs/messaging.c | 82 ------------------------------------------------- fs/ecryptfs/miscdev.c | 28 +++++++---------- 2 files changed, 11 insertions(+), 99 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 295e7fa5..f1c17e8 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -133,45 +133,6 @@ out: return rc; } -static int -ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type, - struct ecryptfs_msg_ctx **msg_ctx); - -/** - * ecryptfs_send_raw_message - * @msg_type: Message type - * @daemon: Daemon struct for recipient of message - * - * A raw message is one that does not include an ecryptfs_message - * struct. It simply has a type. - * - * Must be called with ecryptfs_daemon_hash_mux held. - * - * Returns zero on success; non-zero otherwise - */ -static int ecryptfs_send_raw_message(u8 msg_type, - struct ecryptfs_daemon *daemon) -{ - struct ecryptfs_msg_ctx *msg_ctx; - int rc; - - rc = ecryptfs_send_message_locked(NULL, 0, msg_type, &msg_ctx); - if (rc) { - printk(KERN_ERR "%s: Error whilst attempting to send " - "message to ecryptfsd; rc = [%d]\n", __func__, rc); - goto out; - } - /* Raw messages are logically context-free (e.g., no - * reply is expected), so we set the state of the - * ecryptfs_msg_ctx object to indicate that it should - * be freed as soon as the message is sent. */ - mutex_lock(&msg_ctx->mux); - msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY; - mutex_unlock(&msg_ctx->mux); -out: - return rc; -} - /** * ecryptfs_spawn_daemon - Create and initialize a new daemon struct * @daemon: Pointer to set to newly allocated daemon struct @@ -212,49 +173,6 @@ out: } /** - * ecryptfs_process_helo - * @euid: The user ID owner of the message - * @user_ns: The namespace in which @euid applies - * @pid: The process ID for the userspace program that sent the - * message - * - * Adds the euid and pid values to the daemon euid hash. If an euid - * already has a daemon pid registered, the daemon will be - * unregistered before the new daemon is put into the hash list. - * Returns zero after adding a new daemon to the hash list; - * non-zero otherwise. - */ -int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns, - struct pid *pid) -{ - struct ecryptfs_daemon *new_daemon; - struct ecryptfs_daemon *old_daemon; - int rc; - - mutex_lock(&ecryptfs_daemon_hash_mux); - rc = ecryptfs_find_daemon_by_euid(&old_daemon, euid, user_ns); - if (rc != 0) { - printk(KERN_WARNING "Received request from user [%d] " - "to register daemon [0x%p]; unregistering daemon " - "[0x%p]\n", euid, pid, old_daemon->pid); - rc = ecryptfs_send_raw_message(ECRYPTFS_MSG_QUIT, old_daemon); - if (rc) - printk(KERN_WARNING "Failed to send QUIT " - "message to daemon [0x%p]; rc = [%d]\n", - old_daemon->pid, rc); - hlist_del(&old_daemon->euid_chain); - kfree(old_daemon); - } - rc = ecryptfs_spawn_daemon(&new_daemon, euid, user_ns, pid); - if (rc) - printk(KERN_ERR "%s: The gods are displeased with this attempt " - "to create a new daemon object for euid [%d]; pid " - "[0x%p]; rc = [%d]\n", __func__, euid, pid, rc); - mutex_unlock(&ecryptfs_daemon_hash_mux); - return rc; -} - -/** * ecryptfs_exorcise_daemon - Destroy the daemon struct * * Must be called ceremoniously while in possession of diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index dda3c58..4ec8f61 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -193,26 +193,20 @@ int ecryptfs_send_miscdev(char *data, size_t data_size, int rc = 0; mutex_lock(&msg_ctx->mux); - if (data) { - msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size), - GFP_KERNEL); - if (!msg_ctx->msg) { - rc = -ENOMEM; - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kmalloc(%zd, GFP_KERNEL)\n", __func__, - (sizeof(*msg_ctx->msg) + data_size)); - goto out_unlock; - } - } else - msg_ctx->msg = NULL; + msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size), + GFP_KERNEL); + if (!msg_ctx->msg) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc(%zd, GFP_KERNEL)\n", __func__, + (sizeof(*msg_ctx->msg) + data_size)); + goto out_unlock; + } msg_ctx->msg->index = msg_ctx->index; msg_ctx->msg->data_len = data_size; msg_ctx->type = msg_type; - if (data) { - memcpy(msg_ctx->msg->data, data, data_size); - msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size); - } else - msg_ctx->msg_size = 0; + memcpy(msg_ctx->msg->data, data, data_size); + msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size); mutex_lock(&daemon->mux); list_add_tail(&msg_ctx->daemon_out_list, &daemon->msg_ctx_out_queue); daemon->num_queued_msg_ctx++; -- cgit v1.1 From 3a5203ab3c0c31e0f1434c69e893bfb85c6e6657 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Mon, 16 Mar 2009 12:35:12 -0500 Subject: eCryptfs: Print FNEK sig properly in /proc/mounts The filename encryption key signature is not properly displayed in /proc/mounts. The "ecryptfs_sig=" mount option name is displayed for all global authentication tokens, included those for filename keys. This patch checks the global authentication token flags to determine if the key is a FEKEK or FNEK and prints the appropriate mount option name before the signature. Signed-off-by: Tyler Hicks --- fs/ecryptfs/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index c27ac2b..b500302 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -170,7 +170,10 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt) list_for_each_entry(walker, &mount_crypt_stat->global_auth_tok_list, mount_crypt_stat_list) { - seq_printf(m, ",ecryptfs_sig=%s", walker->sig); + if (walker->flags & ECRYPTFS_AUTH_TOK_FNEK) + seq_printf(m, ",ecryptfs_fnek_sig=%s", walker->sig); + else + seq_printf(m, ",ecryptfs_sig=%s", walker->sig); } mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); -- cgit v1.1 From 13a791b4e63eb0537a7f804a340d6527485983b4 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Mon, 13 Apr 2009 15:29:27 -0500 Subject: eCryptfs: Fix data corruption when using ecryptfs_passthrough ecryptfs_passthrough is a mount option that allows eCryptfs to allow data to be written to non-eCryptfs files in the lower filesystem. The passthrough option was causing data corruption due to it not always being treated as a non-eCryptfs file. The first 8 bytes of an eCryptfs file contains the decrypted file size. This value was being written to the non-eCryptfs files, too. Also, extra 0x00 characters were being written to make the file size a multiple of PAGE_CACHE_SIZE. Signed-off-by: Tyler Hicks --- fs/ecryptfs/crypto.c | 21 ++------------------- fs/ecryptfs/inode.c | 9 +++++++-- fs/ecryptfs/mmap.c | 11 +++++++++++ fs/ecryptfs/read_write.c | 32 +++++++++++++++++++++----------- 4 files changed, 41 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 8b65f28..b91851f 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -483,15 +483,7 @@ int ecryptfs_encrypt_page(struct page *page) ecryptfs_inode = page->mapping->host; crypt_stat = &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); - if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { - rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, - 0, PAGE_CACHE_SIZE); - if (rc) - printk(KERN_ERR "%s: Error attempting to copy " - "page at index [%ld]\n", __func__, - page->index); - goto out; - } + BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); enc_extent_page = alloc_page(GFP_USER); if (!enc_extent_page) { rc = -ENOMEM; @@ -620,16 +612,7 @@ int ecryptfs_decrypt_page(struct page *page) ecryptfs_inode = page->mapping->host; crypt_stat = &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); - if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { - rc = ecryptfs_read_lower_page_segment(page, page->index, 0, - PAGE_CACHE_SIZE, - ecryptfs_inode); - if (rc) - printk(KERN_ERR "%s: Error attempting to copy " - "page at index [%ld]\n", __func__, - page->index); - goto out; - } + BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); enc_extent_page = alloc_page(GFP_USER); if (!enc_extent_page) { rc = -ENOMEM; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 55b3145..5ed86e2 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -814,6 +814,13 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) size_t num_zeros = (PAGE_CACHE_SIZE - (new_length & ~PAGE_CACHE_MASK)); + if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + rc = vmtruncate(inode, new_length); + if (rc) + goto out_free; + rc = vmtruncate(lower_dentry->d_inode, new_length); + goto out_free; + } if (num_zeros) { char *zeros_virt; @@ -915,8 +922,6 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) } rc = 0; crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); - mutex_unlock(&crypt_stat->cs_mutex); - goto out; } } mutex_unlock(&crypt_stat->cs_mutex); diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 46cec2b..5c6bab9 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -449,6 +449,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode) struct ecryptfs_crypt_stat *crypt_stat; crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; + BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) return ecryptfs_write_inode_size_to_xattr(ecryptfs_inode); else @@ -490,6 +491,16 @@ static int ecryptfs_write_end(struct file *file, ecryptfs_printk(KERN_DEBUG, "Not a new file\n"); ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" "(page w/ index = [0x%.16x], to = [%d])\n", index, to); + if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { + rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, page, 0, + to); + if (!rc) { + rc = copied; + fsstack_copy_inode_size(ecryptfs_inode, + ecryptfs_inode_to_lower(ecryptfs_inode)); + } + goto out; + } /* Fills in zeros if 'to' goes beyond inode size */ rc = fill_zeros_to_end_of_page(page, to); if (rc) { diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 75c2ea9..a137c6e 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -117,13 +117,15 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, size_t size) { struct page *ecryptfs_page; + struct ecryptfs_crypt_stat *crypt_stat; + struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode; char *ecryptfs_page_virt; - loff_t ecryptfs_file_size = - i_size_read(ecryptfs_file->f_dentry->d_inode); + loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); loff_t data_offset = 0; loff_t pos; int rc = 0; + crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; /* * if we are writing beyond current size, then start pos * at the current size - we'll fill in zeros from there. @@ -184,7 +186,13 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, flush_dcache_page(ecryptfs_page); SetPageUptodate(ecryptfs_page); unlock_page(ecryptfs_page); - rc = ecryptfs_encrypt_page(ecryptfs_page); + if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) + rc = ecryptfs_encrypt_page(ecryptfs_page); + else + rc = ecryptfs_write_lower_page_segment(ecryptfs_inode, + ecryptfs_page, + start_offset_in_page, + data_offset); page_cache_release(ecryptfs_page); if (rc) { printk(KERN_ERR "%s: Error encrypting " @@ -194,14 +202,16 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, pos += num_bytes; } if ((offset + size) > ecryptfs_file_size) { - i_size_write(ecryptfs_file->f_dentry->d_inode, (offset + size)); - rc = ecryptfs_write_inode_size_to_metadata( - ecryptfs_file->f_dentry->d_inode); - if (rc) { - printk(KERN_ERR "Problem with " - "ecryptfs_write_inode_size_to_metadata; " - "rc = [%d]\n", rc); - goto out; + i_size_write(ecryptfs_inode, (offset + size)); + if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) { + rc = ecryptfs_write_inode_size_to_metadata( + ecryptfs_inode); + if (rc) { + printk(KERN_ERR "Problem with " + "ecryptfs_write_inode_size_to_metadata; " + "rc = [%d]\n", rc); + goto out; + } } } out: -- cgit v1.1 From e77cc8d243f9f1e1d3f0799e23cc14e837ccc8c6 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Wed, 22 Apr 2009 04:08:46 -0500 Subject: eCryptfs: Remove ecryptfs_unlink_sigs warnings A feature was added to the eCryptfs umount helper to automatically unlink the keys used for an eCryptfs mount from the kernel keyring upon umount. This patch keeps the unrecognized mount option warnings for ecryptfs_unlink_sigs out of the logs. Signed-off-by: Tyler Hicks --- fs/ecryptfs/ecryptfs_kernel.h | 1 + fs/ecryptfs/main.c | 6 +++++- fs/ecryptfs/super.c | 2 ++ 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 064c582..00b30a2 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -269,6 +269,7 @@ struct ecryptfs_crypt_stat { #define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800 #define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000 #define ECRYPTFS_ENCFN_USE_FEK 0x00002000 +#define ECRYPTFS_UNLINK_SIGS 0x00004000 u32 flags; unsigned int file_version; size_t iv_bytes; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 7638b0a..ccabd5f 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -208,7 +208,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, - ecryptfs_opt_err }; + ecryptfs_opt_unlink_sigs, ecryptfs_opt_err }; static const match_table_t tokens = { {ecryptfs_opt_sig, "sig=%s"}, @@ -222,6 +222,7 @@ static const match_table_t tokens = { {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"}, {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, + {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, {ecryptfs_opt_err, NULL} }; @@ -402,6 +403,9 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) fn_cipher_key_bytes; fn_cipher_key_bytes_set = 1; break; + case ecryptfs_opt_unlink_sigs: + mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS; + break; case ecryptfs_opt_err: default: printk(KERN_WARNING diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index b500302..fa4c7e7 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -189,6 +189,8 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, ",ecryptfs_xattr_metadata"); if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) seq_printf(m, ",ecryptfs_encrypted_view"); + if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS) + seq_printf(m, ",ecryptfs_unlink_sigs"); return 0; } -- cgit v1.1 From ca8e34f2b05a8289b47907b083dc01dd654ecbde Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Wed, 22 Apr 2009 16:27:12 -0500 Subject: eCryptfs: Lock lower directory inode mutex during lookup This patch locks the lower directory inode's i_mutex before calling lookup_one_len() to find the appropriate dentry in the lower filesystem. This bug was found thanks to the warning set in commit 2f9092e1. Signed-off-by: Tyler Hicks --- fs/ecryptfs/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 5ed86e2..d832526 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -379,9 +379,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, goto out_d_drop; } lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); + mutex_lock(&lower_dir_dentry->d_inode->i_mutex); lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, lower_dir_dentry, ecryptfs_dentry->d_name.len); + mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " @@ -406,9 +408,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, "filename; rc = [%d]\n", __func__, rc); goto out_d_drop; } + mutex_lock(&lower_dir_dentry->d_inode->i_mutex); lower_dentry = lookup_one_len(encrypted_and_encoded_name, lower_dir_dentry, encrypted_and_encoded_name_size - 1); + mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " -- cgit v1.1 From 3a6b42cadc112b01daf0525e5fcd90bb333a5bb3 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Thu, 16 Apr 2009 18:35:37 -0500 Subject: eCryptfs: Larger buffer for encrypted symlink targets When using filename encryption with eCryptfs, the value of the symlink in the lower filesystem is encrypted and stored as a Tag 70 packet. This results in a longer symlink target than if the target value wasn't encrypted. Users were reporting these messages in their syslog: [ 45.653441] ecryptfs_parse_tag_70_packet: max_packet_size is [56]; real packet size is [51] [ 45.653444] ecryptfs_decode_and_decrypt_filename: Could not parse tag 70 packet from filename; copying through filename as-is This was due to bufsiz, one the arguments in readlink(), being used to when allocating the buffer passed to the lower inode's readlink(). That symlink target may be very large, but when decoded and decrypted, could end up being smaller than bufsize. To fix this, the buffer passed to the lower inode's readlink() will always be PATH_MAX in size when filename encryption is enabled. Any necessary truncation occurs after the decoding and decrypting. Signed-off-by: Tyler Hicks --- fs/ecryptfs/inode.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index d832526..93bc0f8 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -640,8 +640,9 @@ static int ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) { char *lower_buf; + size_t lower_bufsiz; struct dentry *lower_dentry; - struct ecryptfs_crypt_stat *crypt_stat; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; char *plaintext_name; size_t plaintext_name_size; mm_segment_t old_fs; @@ -652,12 +653,21 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) rc = -EINVAL; goto out; } - crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; + mount_crypt_stat = &ecryptfs_superblock_to_private( + dentry->d_sb)->mount_crypt_stat; + /* + * If the lower filename is encrypted, it will result in a significantly + * longer name. If needed, truncate the name after decode and decrypt. + */ + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + lower_bufsiz = PATH_MAX; + else + lower_bufsiz = bufsiz; /* Released in this function */ - lower_buf = kmalloc(bufsiz, GFP_KERNEL); + lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL); if (lower_buf == NULL) { printk(KERN_ERR "%s: Out of memory whilst attempting to " - "kmalloc [%d] bytes\n", __func__, bufsiz); + "kmalloc [%d] bytes\n", __func__, lower_bufsiz); rc = -ENOMEM; goto out; } @@ -665,7 +675,7 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) set_fs(get_ds()); rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, (char __user *)lower_buf, - bufsiz); + lower_bufsiz); set_fs(old_fs); if (rc >= 0) { rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name, @@ -678,7 +688,9 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) rc); goto out_free_lower_buf; } - rc = copy_to_user(buf, plaintext_name, plaintext_name_size); + /* Check for bufsiz <= 0 done in sys_readlinkat() */ + rc = copy_to_user(buf, plaintext_name, + min((unsigned) bufsiz, plaintext_name_size)); if (rc) rc = -EFAULT; else -- cgit v1.1 From e84a26ce178345498a7eca0590852bcc36f1092f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 22 Apr 2009 20:52:25 -0400 Subject: ext4: Make the extent validity check more paranoid Instead of just checking that the extent block number is greater or equal than s_first_data_block, make sure it it is not pointing into the block group descriptors, since that is clearly wrong. This helps prevent filesystem from getting very badly corrupted in case an extent block is corrupted. Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 6132353..c28ffe2 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -326,11 +326,14 @@ ext4_ext_max_entries(struct inode *inode, int depth) static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { - ext4_fsblk_t block = ext_pblock(ext); + ext4_fsblk_t block = ext_pblock(ext), valid_block; int len = ext4_ext_get_actual_len(ext); struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - if (unlikely(block < le32_to_cpu(es->s_first_data_block) || - ((block + len) > ext4_blocks_count(es)))) + + valid_block = le32_to_cpu(es->s_first_data_block) + + EXT4_SB(inode->i_sb)->s_gdb_count; + if (unlikely(block <= valid_block || + ((block + len) > ext4_blocks_count(es)))) return 0; else return 1; @@ -339,10 +342,13 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) static int ext4_valid_extent_idx(struct inode *inode, struct ext4_extent_idx *ext_idx) { - ext4_fsblk_t block = idx_pblock(ext_idx); + ext4_fsblk_t block = idx_pblock(ext_idx), valid_block; struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - if (unlikely(block < le32_to_cpu(es->s_first_data_block) || - (block >= ext4_blocks_count(es)))) + + valid_block = le32_to_cpu(es->s_first_data_block) + + EXT4_SB(inode->i_sb)->s_gdb_count; + if (unlikely(block <= valid_block || + (block >= ext4_blocks_count(es)))) return 0; else return 1; -- cgit v1.1 From b5451f7b2694b04d9f912f6cf09db1729f291996 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 22 Apr 2009 21:00:36 -0400 Subject: ext4: Fix potential inode allocation soft lockup in Orlov allocator If the Orlov allocator is having trouble finding an appropriate block group, the fallback code could loop forever, causing a soft lockup warning in find_group_orlov(): BUG: soft lockup - CPU#0 stuck for 61s! [cp:11728] ... Pid: 11728, comm: cp Not tainted (2.6.30-rc1-dirty #77) Lenovo EIP: 0060:[] EFLAGS: 00000246 CPU: 0 EIP is at ext4_get_group_desc+0x54/0x9d ... Call Trace: [] find_group_orlov+0x2ee/0x334 [] ? sched_clock+0x8/0xb [] ext4_new_inode+0x2cf/0xb1a Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index cbce5aa..f18e0a0 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -585,6 +585,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, fallback: ngroups = sbi->s_groups_count; avefreei = freei / ngroups; +fallback_retry: parent_group = EXT4_I(parent)->i_block_group; for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; @@ -602,7 +603,7 @@ fallback: * filesystems the above test can fail to find any blockgroups */ avefreei = 0; - goto fallback; + goto fallback_retry; } return -1; -- cgit v1.1 From d8bd504ab800c8e9aadb983914a33e7166320bec Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 23 Apr 2009 08:54:02 +0100 Subject: GFS2: Fix bug in block allocation The new bitfit algorithm was counting from the wrong end of 64 bit words in the bitfield. This fixes it by using __ffs64 instead of fls64 Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index f03d024..c9786a4 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -212,8 +212,7 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len, if (tmp == 0) return BFITNOENT; ptr--; - bit = fls64(tmp); - bit--; /* fls64 always adds one to the bit count */ + bit = __ffs64(tmp); bit /= 2; /* two bits per entry in the bitmap */ return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit; } -- cgit v1.1 From d9ba7615bfd8bb06f79c853f9dfff9e93a837941 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 23 Apr 2009 08:59:41 +0100 Subject: GFS2: Ensure that the inode goal block settings are updated GFS2 has a goal block associated with each inode indicating the search start position for future block allocations (in fact there are two, but thats for backward compatibility with GFS1 as they are set to identical locations in GFS2). In some circumstances, depending on the ordering of updates to the inode it was possible for the goal block settings to not be updated on disk. This patch ensures that the goal block will always get updated, thus reducing the potential for searching the same (already allocated) blocks again when looking for free space during block allocation. Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index c9786a4..5650382 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1444,10 +1444,12 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; struct gfs2_alloc *al = ip->i_alloc; struct gfs2_rgrpd *rgd = al->al_rgd; u32 goal, blk; u64 block; + int error; if (rgrp_contains_block(rgd, ip->i_goal)) goal = ip->i_goal - rgd->rd_data0; @@ -1460,7 +1462,13 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) rgd->rd_last_alloc = blk; block = rgd->rd_data0 + blk; ip->i_goal = block; - + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error == 0) { + struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal); + brelse(dibh); + } gfs2_assert_withdraw(sdp, rgd->rd_free >= *n); rgd->rd_free -= *n; -- cgit v1.1 From e1c805309d19c69d4ebeac38724076fa86feacdf Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 23 Apr 2009 13:58:08 +0200 Subject: [S390] /proc/stat idle field for idle cpus The cpu idle field in the output of /proc/stat is too small for cpus that have been idle for more than a tick. Add the architecture hook arch_idle_time that allows to add the not accounted idle time of a sleeping cpu without waking the cpu. The s390 implementation of arch_idle_time uses the already existing s390_idle_data per_cpu variable to find the sleep time of a neighboring idle cpu. Signed-off-by: Martin Schwidefsky --- fs/proc/stat.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/proc/stat.c b/fs/proc/stat.c index f75efa2..81e4eb6 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -18,6 +18,9 @@ #ifndef arch_irq_stat #define arch_irq_stat() 0 #endif +#ifndef arch_idle_time +#define arch_idle_time(cpu) 0 +#endif static int show_stat(struct seq_file *p, void *v) { @@ -40,6 +43,7 @@ static int show_stat(struct seq_file *p, void *v) nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); system = cputime64_add(system, kstat_cpu(i).cpustat.system); idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle); + idle = cputime64_add(idle, arch_idle_time(i)); iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait); irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); @@ -69,6 +73,7 @@ static int show_stat(struct seq_file *p, void *v) nice = kstat_cpu(i).cpustat.nice; system = kstat_cpu(i).cpustat.system; idle = kstat_cpu(i).cpustat.idle; + idle = cputime64_add(idle, arch_idle_time(i)); iowait = kstat_cpu(i).cpustat.iowait; irq = kstat_cpu(i).cpustat.irq; softirq = kstat_cpu(i).cpustat.softirq; -- cgit v1.1 From 8c652f96d3852b97a49c331cd0bb02d22f3cb31b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 24 Apr 2009 01:01:56 +0200 Subject: do_execve() must not clear fs->in_exec if it was set by another thread If do_execve() fails after check_unsafe_exec(), it clears fs->in_exec unconditionally. This is wrong if we race with our sub-thread which also does do_execve: Two threads T1 and T2 and another process P, all share the same ->fs. T1 starts do_execve(BAD_FILE). It calls check_unsafe_exec(), since ->fs is shared, we set LSM_UNSAFE but not ->in_exec. P exits and decrements fs->users. T2 starts do_execve(), calls check_unsafe_exec(), now ->fs is not shared, we set fs->in_exec. T1 continues, open_exec(BAD_FILE) fails, we clear ->in_exec and return to the user-space. T1 does clone(CLONE_FS /* without CLONE_THREAD */). T2 continues without LSM_UNSAFE_SHARE while ->fs is shared with another process. Change check_unsafe_exec() to return res = 1 if we set ->in_exec, and change do_execve() to clear ->in_exec depending on res. When do_execve() suceeds, it is safe to clear ->in_exec unconditionally. It can be set only if we don't share ->fs with another process, and since we already killed all sub-threads either ->in_exec == 0 or we are the only user of this ->fs. Also, we do not need fs->lock to clear fs->in_exec. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Acked-by: Hugh Dickins Signed-off-by: Linus Torvalds --- fs/compat.c | 11 +++++------ fs/exec.c | 19 ++++++++++--------- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index 379a399..681ed81 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1476,6 +1476,7 @@ int compat_do_execve(char * filename, struct linux_binprm *bprm; struct file *file; struct files_struct *displaced; + bool clear_in_exec; int retval; retval = unshare_files(&displaced); @@ -1498,8 +1499,9 @@ int compat_do_execve(char * filename, goto out_unlock; retval = check_unsafe_exec(bprm); - if (retval) + if (retval < 0) goto out_unlock; + clear_in_exec = retval; file = open_exec(filename); retval = PTR_ERR(file); @@ -1546,9 +1548,7 @@ int compat_do_execve(char * filename, goto out; /* execve succeeded */ - write_lock(¤t->fs->lock); current->fs->in_exec = 0; - write_unlock(¤t->fs->lock); current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); acct_update_integrals(current); @@ -1568,9 +1568,8 @@ out_file: } out_unmark: - write_lock(¤t->fs->lock); - current->fs->in_exec = 0; - write_unlock(¤t->fs->lock); + if (clear_in_exec) + current->fs->in_exec = 0; out_unlock: current->in_execve = 0; diff --git a/fs/exec.c b/fs/exec.c index 052a961..a2e6989 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1077,9 +1077,11 @@ int check_unsafe_exec(struct linux_binprm *bprm) if (p->fs->users > n_fs) { bprm->unsafe |= LSM_UNSAFE_SHARE; } else { - if (p->fs->in_exec) - res = -EAGAIN; - p->fs->in_exec = 1; + res = -EAGAIN; + if (!p->fs->in_exec) { + p->fs->in_exec = 1; + res = 1; + } } unlock_task_sighand(p, &flags); @@ -1284,6 +1286,7 @@ int do_execve(char * filename, struct linux_binprm *bprm; struct file *file; struct files_struct *displaced; + bool clear_in_exec; int retval; retval = unshare_files(&displaced); @@ -1306,8 +1309,9 @@ int do_execve(char * filename, goto out_unlock; retval = check_unsafe_exec(bprm); - if (retval) + if (retval < 0) goto out_unlock; + clear_in_exec = retval; file = open_exec(filename); retval = PTR_ERR(file); @@ -1355,9 +1359,7 @@ int do_execve(char * filename, goto out; /* execve succeeded */ - write_lock(¤t->fs->lock); current->fs->in_exec = 0; - write_unlock(¤t->fs->lock); current->in_execve = 0; mutex_unlock(¤t->cred_exec_mutex); acct_update_integrals(current); @@ -1377,9 +1379,8 @@ out_file: } out_unmark: - write_lock(¤t->fs->lock); - current->fs->in_exec = 0; - write_unlock(¤t->fs->lock); + if (clear_in_exec) + current->fs->in_exec = 0; out_unlock: current->in_execve = 0; -- cgit v1.1 From 437f7fdb607f32b737e4da9f14bebcfdac2c90c3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 24 Apr 2009 01:02:45 +0200 Subject: check_unsafe_exec: s/lock_task_sighand/rcu_read_lock/ write_lock(¤t->fs->lock) guarantees we can't wrongly miss LSM_UNSAFE_SHARE, this is what we care about. Use rcu_read_lock() instead of ->siglock to iterate over the sub-threads. We must see all CLONE_THREAD|CLONE_FS threads which didn't pass exit_fs(), it takes fs->lock too. With or without this patch we can miss the freshly cloned thread and set LSM_UNSAFE_SHARE, we don't care. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath [ Fixed lock/unlock typo - Hugh ] Acked-by: Hugh Dickins Signed-off-by: Linus Torvalds --- fs/exec.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index a2e6989..a3a8ce8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1060,7 +1060,6 @@ EXPORT_SYMBOL(install_exec_creds); int check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; - unsigned long flags; unsigned n_fs; int res = 0; @@ -1068,11 +1067,12 @@ int check_unsafe_exec(struct linux_binprm *bprm) n_fs = 1; write_lock(&p->fs->lock); - lock_task_sighand(p, &flags); + rcu_read_lock(); for (t = next_thread(p); t != p; t = next_thread(t)) { if (t->fs == p->fs) n_fs++; } + rcu_read_unlock(); if (p->fs->users > n_fs) { bprm->unsafe |= LSM_UNSAFE_SHARE; @@ -1083,8 +1083,6 @@ int check_unsafe_exec(struct linux_binprm *bprm) res = 1; } } - - unlock_task_sighand(p, &flags); write_unlock(&p->fs->lock); return res; -- cgit v1.1 From 485c26ec70f823f2a9cf45982b724893e53a859e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 24 Apr 2009 13:43:20 -0400 Subject: ext4: Fix softlockup caused by illegal i_file_acl value in on-disk inode If the block containing external extended attributes (which is stored in i_file_acl and i_file_acl_high) is larger than the on-disk filesystem, the process which tried to access the extended attributes will endlessly issue kernel printks complaining that "__find_get_block_slow() failed", locking up that CPU until the system is forcibly rebooted. So when we read in the inode, make sure the i_file_acl value is legal, and if not, flag the filesystem as being corrupted. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c6bd6ce..cab75bb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4409,7 +4409,17 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } - if (ei->i_flags & EXT4_EXTENTS_FL) { + if (ei->i_file_acl && + ((ei->i_file_acl < + (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + + EXT4_SB(sb)->s_gdb_count)) || + (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { + ext4_error(sb, __func__, + "bad extended attribute block %llu in inode #%lu", + ei->i_file_acl, inode->i_ino); + ret = -EIO; + goto bad_inode; + } else if (ei->i_flags & EXT4_EXTENTS_FL) { /* Validate extent which is part of inode */ ret = ext4_ext_check_inode(inode); } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || -- cgit v1.1 From 97e728d4353f38c87bf0804cdfd79a9b13fc2c3e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Apr 2009 17:40:57 -0400 Subject: Btrfs: try to keep a healthy ratio of metadata vs data block groups This patch makes the chunk allocator keep a good ratio of metadata vs data block groups. By default for every 8 data block groups, we'll allocate 1 metadata chunk, or about 12% of the disk will be allocated for metadata. This can be changed by specifying the metadata_ratio mount option. This is simply the number of data block groups that have to be allocated to force a metadata chunk allocation. By making sure we allocate metadata chunks more often, we are less likely to get into situations where the whole disk has been allocated as data block groups. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 28 +++++++++++++++++++++++++++- fs/btrfs/super.c | 12 +++++++++++- 4 files changed, 42 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ad96495..213535f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -881,6 +881,9 @@ struct btrfs_fs_info { u64 metadata_alloc_profile; u64 system_alloc_profile; + unsigned data_chunk_allocations; + unsigned metadata_ratio; + void *bdev_holder; }; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a6b8374..44c94d8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1604,6 +1604,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->btree_inode = new_inode(sb); fs_info->btree_inode->i_ino = 1; fs_info->btree_inode->i_nlink = 1; + fs_info->metadata_ratio = 8; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 178df4c..2895a83 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1918,15 +1918,29 @@ void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, spin_unlock(&info->lock); } +static void force_metadata_allocation(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + found->force_alloc = 1; + } + rcu_read_unlock(); +} + static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force) { struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = extent_root->fs_info; u64 thresh; int ret = 0; - mutex_lock(&extent_root->fs_info->chunk_mutex); + mutex_lock(&fs_info->chunk_mutex); flags = btrfs_reduce_alloc_profile(extent_root, flags); @@ -1958,6 +1972,18 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, } spin_unlock(&space_info->lock); + /* + * if we're doing a data chunk, go ahead and make sure that + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. + */ + if (flags & BTRFS_BLOCK_GROUP_DATA) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) + force_metadata_allocation(fs_info); + } + ret = btrfs_alloc_chunk(trans, extent_root, flags); if (ret) space_info->full = 1; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9744af9..30c9a8c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -68,7 +68,7 @@ enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog, - Opt_flushoncommit, Opt_err, + Opt_ratio, Opt_flushoncommit, Opt_err, }; static match_table_t tokens = { @@ -87,6 +87,7 @@ static match_table_t tokens = { {Opt_noacl, "noacl"}, {Opt_notreelog, "notreelog"}, {Opt_flushoncommit, "flushoncommit"}, + {Opt_ratio, "metadata_ratio=%d"}, {Opt_err, NULL}, }; @@ -234,6 +235,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); break; + case Opt_ratio: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->metadata_ratio = intarg; + printk(KERN_INFO "btrfs: metadata ratio %d\n", + info->metadata_ratio); + } + break; default: break; } -- cgit v1.1 From 2ea2544ef5dad5cac52f1e4c7b812631274fc1cb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Apr 2009 15:32:28 +0200 Subject: Btrfs: simplify makefile Get rid of the hacks for building out of tree, and always use += for assigning to the object lists. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9adf5e4..9421284 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,25 +1,10 @@ -ifneq ($(KERNELRELEASE),) -# kbuild part of makefile obj-$(CONFIG_BTRFS_FS) := btrfs.o -btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + +btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ compression.o delayed-ref.o -else - -# Normal Makefile - -KERNELDIR := /lib/modules/`uname -r`/build -all: - $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules - -modules_install: - $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install -clean: - $(MAKE) -C $(KERNELDIR) M=`pwd` clean - -endif -- cgit v1.1 From 0d4bf11e5309eff64272a49e1ea55658372abc56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Apr 2009 15:33:56 +0200 Subject: Btrfs: don't export symbols Currently the extent_map code is only for btrfs so don't export it's symbols. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/extent_map.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index b187917..9827fa1 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -43,7 +43,6 @@ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) tree->map.rb_node = NULL; spin_lock_init(&tree->lock); } -EXPORT_SYMBOL(extent_map_tree_init); /** * alloc_extent_map - allocate new extent map structure @@ -64,7 +63,6 @@ struct extent_map *alloc_extent_map(gfp_t mask) atomic_set(&em->refs, 1); return em; } -EXPORT_SYMBOL(alloc_extent_map); /** * free_extent_map - drop reference count of an extent_map @@ -83,7 +81,6 @@ void free_extent_map(struct extent_map *em) kmem_cache_free(extent_map_cache, em); } } -EXPORT_SYMBOL(free_extent_map); static struct rb_node *tree_insert(struct rb_root *root, u64 offset, struct rb_node *node) @@ -264,7 +261,6 @@ int add_extent_mapping(struct extent_map_tree *tree, out: return ret; } -EXPORT_SYMBOL(add_extent_mapping); /* simple helper to do math around the end of an extent, handling wrap */ static u64 range_end(u64 start, u64 len) @@ -326,7 +322,6 @@ found: out: return em; } -EXPORT_SYMBOL(lookup_extent_mapping); /** * remove_extent_mapping - removes an extent_map from the extent tree @@ -346,4 +341,3 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) em->in_tree = 0; return ret; } -EXPORT_SYMBOL(remove_extent_mapping); -- cgit v1.1 From 9601e3f6336f6ca66929f451b1f66085e68e36e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Apr 2009 15:33:09 +0200 Subject: Btrfs: kill btrfs_cache_create Just use kmem_cache_create directly. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 18 ++++++------------ fs/btrfs/extent_map.c | 11 +++-------- fs/btrfs/inode.c | 42 +++++++++++++++++++----------------------- 3 files changed, 28 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 05a1c42..c33b540 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -17,12 +17,6 @@ #include "ctree.h" #include "btrfs_inode.h" -/* temporary define until extent_map moves out of btrfs */ -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *, struct kmem_cache *, - unsigned long)); - static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -58,15 +52,15 @@ struct extent_page_data { int __init extent_io_init(void) { - extent_state_cache = btrfs_cache_create("extent_state", - sizeof(struct extent_state), 0, - NULL); + extent_state_cache = kmem_cache_create("extent_state", + sizeof(struct extent_state), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; - extent_buffer_cache = btrfs_cache_create("extent_buffers", - sizeof(struct extent_buffer), 0, - NULL); + extent_buffer_cache = kmem_cache_create("extent_buffers", + sizeof(struct extent_buffer), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) goto free_state_cache; return 0; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 9827fa1..30c9365 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -6,19 +6,14 @@ #include #include "extent_map.h" -/* temporary define until extent_map moves out of btrfs */ -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *, struct kmem_cache *, - unsigned long)); static struct kmem_cache *extent_map_cache; int __init extent_map_init(void) { - extent_map_cache = btrfs_cache_create("extent_map", - sizeof(struct extent_map), 0, - NULL); + extent_map_cache = kmem_cache_create("extent_map", + sizeof(struct extent_map), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_map_cache) return -ENOMEM; return 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 65219f6..176b6cc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4640,39 +4640,35 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_path_cachep); } -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *)) -{ - return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD | extra_flags), ctor); -} - int btrfs_init_cachep(void) { - btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache", - sizeof(struct btrfs_inode), - 0, init_once); + btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", + sizeof(struct btrfs_inode), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); if (!btrfs_inode_cachep) goto fail; - btrfs_trans_handle_cachep = - btrfs_cache_create("btrfs_trans_handle_cache", - sizeof(struct btrfs_trans_handle), - 0, NULL); + + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", + sizeof(struct btrfs_trans_handle), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_trans_handle_cachep) goto fail; - btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache", - sizeof(struct btrfs_transaction), - 0, NULL); + + btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", + sizeof(struct btrfs_transaction), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_transaction_cachep) goto fail; - btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache", - sizeof(struct btrfs_path), - 0, NULL); + + btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", + sizeof(struct btrfs_path), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_path_cachep) goto fail; - btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256, - SLAB_DESTROY_BY_RCU, NULL); + + btrfs_bit_radix_cachep = kmem_cache_create("btrfs_radix", 256, 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | + SLAB_DESTROY_BY_RCU, NULL); if (!btrfs_bit_radix_cachep) goto fail; return 0; -- cgit v1.1 From e980b50cda1610f1c17978d9b7fd311a9dd93877 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 24 Apr 2009 14:39:24 -0400 Subject: Btrfs: fix fallocate deadlock on inode extent lock The btrfs fallocate call takes an extent lock on the entire range being fallocated, and then runs through insert_reserved_extent on each extent as they are allocated. The problem with this is that btrfs_drop_extents may decide to try and take the same extent lock fallocate was already holding. The solution used here is to push down knowledge of the range that is already locked going into btrfs_drop_extents. It turns out that at least one other caller had the same bug. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 3 ++- fs/btrfs/file.c | 11 ++++++----- fs/btrfs/inode.c | 27 ++++++++++++++++++--------- fs/btrfs/ioctl.c | 3 ++- fs/btrfs/tree-log.c | 2 +- 5 files changed, 29 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 213535f..4414a5d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2177,7 +2177,8 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode); extern struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 inline_limit, u64 *hint_block); + u64 start, u64 end, u64 locked_end, + u64 inline_limit, u64 *hint_block); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 482f8db..da3ed96 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -363,15 +363,16 @@ out: */ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 inline_limit, u64 *hint_byte) + u64 start, u64 end, u64 locked_end, + u64 inline_limit, u64 *hint_byte) { u64 extent_end = 0; - u64 locked_end = end; u64 search_start = start; u64 leaf_start; u64 ram_bytes = 0; u64 orig_parent = 0; u64 disk_bytenr = 0; + u64 orig_locked_end = locked_end; u8 compression; u8 encryption; u16 other_encoding = 0; @@ -684,9 +685,9 @@ next_slot: } out: btrfs_free_path(path); - if (locked_end > end) { - unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, - GFP_NOFS); + if (locked_end > orig_locked_end) { + unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end, + locked_end - 1, GFP_NOFS); } btrfs_check_file(root, inode); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 176b6cc..2fdb299 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -234,7 +234,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, } ret = btrfs_drop_extents(trans, root, inode, start, - aligned_end, start, &hint_byte); + aligned_end, aligned_end, start, &hint_byte); BUG_ON(ret); if (isize > actual_end) @@ -1439,6 +1439,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct inode *inode, u64 file_pos, u64 disk_bytenr, u64 disk_num_bytes, u64 num_bytes, u64 ram_bytes, + u64 locked_end, u8 compression, u8 encryption, u16 other_encoding, int extent_type) { @@ -1455,7 +1456,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; ret = btrfs_drop_extents(trans, root, inode, file_pos, - file_pos + num_bytes, file_pos, &hint); + file_pos + num_bytes, locked_end, + file_pos, &hint); BUG_ON(ret); ins.objectid = inode->i_ino; @@ -1590,6 +1592,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->disk_len, ordered_extent->len, ordered_extent->len, + ordered_extent->file_offset + + ordered_extent->len, compressed, 0, 0, BTRFS_FILE_EXTENT_REG); BUG_ON(ret); @@ -2877,6 +2881,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) err = btrfs_drop_extents(trans, root, inode, cur_offset, cur_offset + hole_size, + block_end, cur_offset, &hint_byte); if (err) break; @@ -4968,7 +4973,7 @@ out_fail: static int prealloc_file_range(struct btrfs_trans_handle *trans, struct inode *inode, u64 start, u64 end, - u64 alloc_hint, int mode) + u64 locked_end, u64 alloc_hint, int mode) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; @@ -4989,7 +4994,8 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, ret = insert_reserved_file_extent(trans, inode, cur_offset, ins.objectid, ins.offset, ins.offset, - ins.offset, 0, 0, 0, + ins.offset, locked_end, + 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); BUG_ON(ret); num_bytes -= ins.offset; @@ -5018,6 +5024,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, u64 alloc_start; u64 alloc_end; u64 alloc_hint = 0; + u64 locked_end; u64 mask = BTRFS_I(inode)->root->sectorsize - 1; struct extent_map *em; struct btrfs_trans_handle *trans; @@ -5039,6 +5046,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, goto out; } + locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; @@ -5051,8 +5059,8 @@ static long btrfs_fallocate(struct inode *inode, int mode, /* the extent lock is ordered inside the running * transaction */ - lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, - alloc_end - 1, GFP_NOFS); + lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + GFP_NOFS); ordered = btrfs_lookup_first_ordered_extent(inode, alloc_end - 1); if (ordered && @@ -5060,7 +5068,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, ordered->file_offset < alloc_end) { btrfs_put_ordered_extent(ordered); unlock_extent(&BTRFS_I(inode)->io_tree, - alloc_start, alloc_end - 1, GFP_NOFS); + alloc_start, locked_end, GFP_NOFS); btrfs_end_transaction(trans, BTRFS_I(inode)->root); /* @@ -5085,7 +5093,8 @@ static long btrfs_fallocate(struct inode *inode, int mode, last_byte = (last_byte + mask) & ~mask; if (em->block_start == EXTENT_MAP_HOLE) { ret = prealloc_file_range(trans, inode, cur_offset, - last_byte, alloc_hint, mode); + last_byte, locked_end + 1, + alloc_hint, mode); if (ret < 0) { free_extent_map(em); break; @@ -5101,7 +5110,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, break; } } - unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, + unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, GFP_NOFS); btrfs_end_transaction(trans, BTRFS_I(inode)->root); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7594bec..f4e5d2e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -830,7 +830,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, BUG_ON(!trans); /* punch hole in destination first */ - btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte); + btrfs_drop_extents(trans, root, inode, off, off + len, + off + len, 0, &hint_byte); /* clone data */ key.objectid = src->i_ino; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 25f20ea..db5e212 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -536,7 +536,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ ret = btrfs_drop_extents(trans, root, inode, - start, extent_end, start, &alloc_hint); + start, extent_end, extent_end, start, &alloc_hint); BUG_ON(ret); if (found_type == BTRFS_FILE_EXTENT_REG || -- cgit v1.1 From 59bc5c758ece00fb0b2a170dd8fbbf31f1856c8a Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 24 Apr 2009 14:39:25 -0400 Subject: Btrfs: fix deadlocks and stalls on dead root removal After a transaction commit, the old root of the subvol btrees are sent through snapshot removal. This is what actually frees up any blocks replaced by COW, and anything the old blocks pointed to. Snapshot deletion will pause when a transaction commit has started, which helps to avoid a huge amount of delayed reference count updates piling up as the transaction is trying to close. But, this pause happens after the snapshot deletion process has asked other procs on the system to throttle back a bit so that it can make progress. We don't want to throttle everyone while we're waiting for the transaction commit, it leads to deadlocks in the user transaction ioctls used by Ceph and makes things slower in general. This patch changes things to avoid the throttling while we sleep. Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2869b33..01b1436 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -687,7 +687,13 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) prepare_to_wait(&info->transaction_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&info->trans_mutex); + + atomic_dec(&info->throttles); + wake_up(&info->transaction_throttle); + schedule(); + + atomic_inc(&info->throttles); mutex_lock(&info->trans_mutex); finish_wait(&info->transaction_wait, &wait); } -- cgit v1.1 From a9e817425dc0baede8ebe5fbc9984a640257432b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 24 Apr 2009 16:11:18 -0400 Subject: ext4: Ignore i_file_acl_high unless EXT4_FEATURE_INCOMPAT_64BIT is present Don't try to look at i_file_acl_high unless the INCOMPAT_64BIT feature bit is set. The field is normally zero, but older versions of e2fsck didn't automatically check to make sure of this, so in the spirit of "be liberal in what you accept", don't look at i_file_acl_high unless we are using a 64-bit filesystem. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cab75bb..1146003 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4357,11 +4357,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_flags = le32_to_cpu(raw_inode->i_flags); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); - if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != - cpu_to_le32(EXT4_OS_HURD)) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; - } inode->i_size = ext4_isize(raw_inode); ei->i_disksize = inode->i_size; inode->i_generation = le32_to_cpu(raw_inode->i_generation); -- cgit v1.1 From 84baf74bf23bbe9f3deafb5d2f27e2b5dc0bc052 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 23 Apr 2009 16:41:13 +0100 Subject: ROMFS: romfs_lookup() shouldn't be doing a partial name comparison romfs_lookup() should be using a routine akin to strcmp() on the backing store, rather than one akin to strncmp(). If it uses the latter, it's liable to match /bin/shutdown when looking up /bin/sh. Signed-off-by: David Howells Tested-by: Michal Simek Signed-off-by: Linus Torvalds --- fs/romfs/internal.h | 4 ++-- fs/romfs/storage.c | 67 +++++++++++++++++++++++++++++++++++++++-------------- fs/romfs/super.c | 4 ++-- 3 files changed, 53 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h index 06044a9..95217b8 100644 --- a/fs/romfs/internal.h +++ b/fs/romfs/internal.h @@ -43,5 +43,5 @@ extern int romfs_dev_read(struct super_block *sb, unsigned long pos, void *buf, size_t buflen); extern ssize_t romfs_dev_strnlen(struct super_block *sb, unsigned long pos, size_t maxlen); -extern int romfs_dev_strncmp(struct super_block *sb, unsigned long pos, - const char *str, size_t size); +extern int romfs_dev_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size); diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c index 7e3e1e1..66ce9dd 100644 --- a/fs/romfs/storage.c +++ b/fs/romfs/storage.c @@ -67,26 +67,35 @@ static ssize_t romfs_mtd_strnlen(struct super_block *sb, * compare a string to one in a romfs image on MTD * - return 1 if matched, 0 if differ, -ve if error */ -static int romfs_mtd_strncmp(struct super_block *sb, unsigned long pos, - const char *str, size_t size) +static int romfs_mtd_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) { - u_char buf[16]; + u_char buf[17]; size_t len, segment; int ret; - /* scan the string up to 16 bytes at a time */ + /* scan the string up to 16 bytes at a time, and attempt to grab the + * trailing NUL whilst we're at it */ + buf[0] = 0xff; + while (size > 0) { - segment = min_t(size_t, size, 16); + segment = min_t(size_t, size + 1, 17); ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf); if (ret < 0) return ret; + len--; if (memcmp(buf, str, len) != 0) return 0; + buf[0] = buf[len]; size -= len; pos += len; str += len; } + /* check the trailing NUL was */ + if (buf[0]) + return 0; + return 1; } #endif /* CONFIG_ROMFS_ON_MTD */ @@ -154,28 +163,48 @@ static ssize_t romfs_blk_strnlen(struct super_block *sb, * compare a string to one in a romfs image on a block device * - return 1 if matched, 0 if differ, -ve if error */ -static int romfs_blk_strncmp(struct super_block *sb, unsigned long pos, - const char *str, size_t size) +static int romfs_blk_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) { struct buffer_head *bh; unsigned long offset; size_t segment; - bool x; + bool matched, terminated = false; - /* scan the string up to 16 bytes at a time */ + /* compare string up to a block at a time */ while (size > 0) { offset = pos & (ROMBSIZE - 1); segment = min_t(size_t, size, ROMBSIZE - offset); bh = sb_bread(sb, pos >> ROMBSBITS); if (!bh) return -EIO; - x = (memcmp(bh->b_data + offset, str, segment) != 0); - brelse(bh); - if (x) - return 0; + matched = (memcmp(bh->b_data + offset, str, segment) == 0); + size -= segment; pos += segment; str += segment; + if (matched && size == 0 && offset + segment < ROMBSIZE) { + if (!bh->b_data[offset + segment]) + terminated = true; + else + matched = false; + } + brelse(bh); + if (!matched) + return 0; + } + + if (!terminated) { + /* the terminating NUL must be on the first byte of the next + * block */ + BUG_ON((pos & (ROMBSIZE - 1)) != 0); + bh = sb_bread(sb, pos >> ROMBSBITS); + if (!bh) + return -EIO; + matched = !bh->b_data[0]; + brelse(bh); + if (!matched) + return 0; } return 1; @@ -234,10 +263,12 @@ ssize_t romfs_dev_strnlen(struct super_block *sb, /* * compare a string to one in romfs + * - the string to be compared to, str, may not be NUL-terminated; instead the + * string is of the specified size * - return 1 if matched, 0 if differ, -ve if error */ -int romfs_dev_strncmp(struct super_block *sb, unsigned long pos, - const char *str, size_t size) +int romfs_dev_strcmp(struct super_block *sb, unsigned long pos, + const char *str, size_t size) { size_t limit; @@ -246,16 +277,16 @@ int romfs_dev_strncmp(struct super_block *sb, unsigned long pos, return -EIO; if (size > ROMFS_MAXFN) return -ENAMETOOLONG; - if (size > limit - pos) + if (size + 1 > limit - pos) return -EIO; #ifdef CONFIG_ROMFS_ON_MTD if (sb->s_mtd) - return romfs_mtd_strncmp(sb, pos, str, size); + return romfs_mtd_strcmp(sb, pos, str, size); #endif #ifdef CONFIG_ROMFS_ON_BLOCK if (sb->s_bdev) - return romfs_blk_strncmp(sb, pos, str, size); + return romfs_blk_strcmp(sb, pos, str, size); #endif return -EIO; } diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 10ca7d9..c53b5ef 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -240,8 +240,8 @@ static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry, goto error; /* try to match the first 16 bytes of name */ - ret = romfs_dev_strncmp(dir->i_sb, offset + ROMFH_SIZE, name, - len); + ret = romfs_dev_strcmp(dir->i_sb, offset + ROMFH_SIZE, name, + len); if (ret < 0) goto error; if (ret == 1) -- cgit v1.1 From 4b2b0b9753194cad44d7295c32044b89710efd70 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 23 Apr 2009 16:41:18 +0100 Subject: ROMFS: Advance destination buffer pointer when reading from a blockdev RomFS should advance the destination buffer pointer when reading data from a blockdev source (the data may be split over multiple blocks, each requiring its own sb_read() call). Without this, all the data is copied to the beginning of the output buffer. Signed-off-by: David Howells Tested-by: Michal Simek Signed-off-by: Linus Torvalds --- fs/romfs/storage.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c index 66ce9dd..b3208ad 100644 --- a/fs/romfs/storage.c +++ b/fs/romfs/storage.c @@ -120,6 +120,7 @@ static int romfs_blk_read(struct super_block *sb, unsigned long pos, return -EIO; memcpy(buf, bh->b_data + offset, segment); brelse(bh); + buf += segment; buflen -= segment; pos += segment; } -- cgit v1.1 From c4b5a614316c505922a522b2e35ba05ea3e08a7c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 24 Apr 2009 18:45:35 -0400 Subject: ext4: Do not try to validate extents on special files The EXTENTS_FL flag should never be set on special files, but if it is, don't bother trying to validate that the extents tree is valid, since only files, directories, and non-fast symlinks will ever have an extent data structure. We perhaps should flag the filesystem as being corrupted if we see a special file (named pipes, device nodes, Unix domain sockets, etc.) with the EXTENTS_FL flag, but e2fsck doesn't currently check this case, so we'll just ignore this for now, since it's harmless. Without this fix, a special device with the extents flag is flagged as an error by the kernel, so it is impossible to access or delete the inode, but e2fsck doesn't see it as a problem, leading to confused/frustrated users. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1146003..e91f978 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4407,6 +4407,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } + ret = 0; if (ei->i_file_acl && ((ei->i_file_acl < (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + @@ -4418,8 +4419,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ret = -EIO; goto bad_inode; } else if (ei->i_flags & EXT4_EXTENTS_FL) { - /* Validate extent which is part of inode */ - ret = ext4_ext_check_inode(inode); + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + (S_ISLNK(inode->i_mode) && + !ext4_inode_is_fast_symlink(inode))) + /* Validate extent which is part of inode */ + ret = ext4_ext_check_inode(inode); } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode))) { -- cgit v1.1 From d6397baee468809ef311e763dfc6e9f73418f8a6 Mon Sep 17 00:00:00 2001 From: Chris Ball Date: Mon, 27 Apr 2009 07:29:03 -0400 Subject: Btrfs: When shrinking, only update disk size on success Previously, we updated a device's size prior to attempting a shrink operation. This patch moves the device resizing logic to only happen if the shrink completes successfully. In the process, it introduces a new field to btrfs_device -- disk_total_bytes -- to track the on-disk size. Signed-off-by: Chris Ball Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 35 ++++++++++++++++++++++++----------- fs/btrfs/volumes.h | 3 +++ 2 files changed, 27 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e53835b..5f01dad 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1543,7 +1543,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); - btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); btrfs_mark_buffer_dirty(leaf); @@ -1940,14 +1940,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) device->total_bytes = new_size; if (device->writeable) device->fs_devices->total_rw_bytes -= diff; - ret = btrfs_update_device(trans, device); - if (ret) { - unlock_chunks(root); - btrfs_end_transaction(trans, root); - goto done; - } - WARN_ON(diff > old_total); - btrfs_set_super_total_bytes(super_copy, old_total - diff); unlock_chunks(root); btrfs_end_transaction(trans, root); @@ -1979,7 +1971,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) length = btrfs_dev_extent_length(l, dev_extent); if (key.offset + length <= new_size) - goto done; + break; chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); @@ -1992,6 +1984,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) goto done; } + /* Shrinking succeeded, else we would be at "done". */ + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto done; + } + lock_chunks(root); + + device->disk_total_bytes = new_size; + /* Now btrfs_update_device() will change the on-disk size. */ + ret = btrfs_update_device(trans, device); + if (ret) { + unlock_chunks(root); + btrfs_end_transaction(trans, root); + goto done; + } + WARN_ON(diff > old_total); + btrfs_set_super_total_bytes(super_copy, old_total - diff); + unlock_chunks(root); + btrfs_end_transaction(trans, root); done: btrfs_free_path(path); return ret; @@ -3076,7 +3088,8 @@ static int fill_device_from_item(struct extent_buffer *leaf, unsigned long ptr; device->devid = btrfs_device_id(leaf, dev_item); - device->total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->total_bytes = device->disk_total_bytes; device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); device->type = btrfs_device_type(leaf, dev_item); device->io_align = btrfs_device_io_align(leaf, dev_item); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 5836327..5c3ff6d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -61,6 +61,9 @@ struct btrfs_device { /* size of the device */ u64 total_bytes; + /* size of the disk */ + u64 disk_total_bytes; + /* bytes used */ u64 bytes_used; -- cgit v1.1 From b7967db75a38df4891b22efe1b0969b9357eb946 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 27 Apr 2009 07:29:04 -0400 Subject: Btrfs: remove #if 0 code Btrfs had some old code sitting around under #if 0, this drops it. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 48 +------------------------------- fs/btrfs/extent_io.c | 63 ------------------------------------------ fs/btrfs/file.c | 78 ---------------------------------------------------- 3 files changed, 1 insertion(+), 188 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 44c94d8..77f9a3b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -584,18 +584,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, btrfs_set_work_high_prio(&async->work); btrfs_queue_worker(&fs_info->workers, &async->work); -#if 0 - int limit = btrfs_async_submit_limit(fs_info); - if (atomic_read(&fs_info->nr_async_submits) > limit) { - wait_event_timeout(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) < limit), - HZ/10); - wait_event_timeout(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_bios) < limit), - HZ/10); - } -#endif while (atomic_read(&fs_info->async_submit_draining) && atomic_read(&fs_info->nr_async_submits)) { wait_event(fs_info->async_submit_wait, @@ -770,27 +759,6 @@ static void btree_invalidatepage(struct page *page, unsigned long offset) } } -#if 0 -static int btree_writepage(struct page *page, struct writeback_control *wbc) -{ - struct buffer_head *bh; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct buffer_head *head; - if (!page_has_buffers(page)) { - create_empty_buffers(page, root->fs_info->sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); - } - head = page_buffers(page); - bh = head; - do { - if (buffer_dirty(bh)) - csum_tree_block(root, bh, 0); - bh = bh->b_this_page; - } while (bh != head); - return block_write_full_page(page, btree_get_block, wbc); -} -#endif - static struct address_space_operations btree_aops = { .readpage = btree_readpage, .writepage = btree_writepage, @@ -1278,11 +1246,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) int ret = 0; struct btrfs_device *device; struct backing_dev_info *bdi; -#if 0 - if ((bdi_bits & (1 << BDI_write_congested)) && - btrfs_congested_async(info, 0)) - return 1; -#endif + list_for_each_entry(device, &info->fs_devices->devices, dev_list) { if (!device->bdev) continue; @@ -2334,16 +2298,6 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); -#if 0 - while (!list_empty(&fs_info->hashers)) { - struct btrfs_hasher *hasher; - hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher, - hashers); - list_del(&hasher->hashers); - crypto_free_hash(&fs_info->hash_tfm); - kfree(hasher); - } -#endif btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c33b540..fe9eb99 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1401,69 +1401,6 @@ out: return total_bytes; } -#if 0 -/* - * helper function to lock both pages and extents in the tree. - * pages must be locked first. - */ -static int lock_range(struct extent_io_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - int err; - - while (index <= end_index) { - page = grab_cache_page(tree->mapping, index); - if (!page) { - err = -ENOMEM; - goto failed; - } - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto failed; - } - index++; - } - lock_extent(tree, start, end, GFP_NOFS); - return 0; - -failed: - /* - * we failed above in getting the page at 'index', so we undo here - * up to but not including the page at 'index' - */ - end_index = index; - index = start >> PAGE_CACHE_SHIFT; - while (index < end_index) { - page = find_get_page(tree->mapping, index); - unlock_page(page); - page_cache_release(page); - index++; - } - return err; -} - -/* - * helper function to unlock both pages and extents in the tree. - */ -static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(tree->mapping, index); - unlock_page(page); - page_cache_release(page); - index++; - } - unlock_extent(tree, start, end, GFP_NOFS); - return 0; -} -#endif - /* * set the private field for a given byte offset in the tree. If there isn't * an extent_state there already, this does nothing. diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index da3ed96..1d51dc3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -272,83 +272,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, return 0; } -int btrfs_check_file(struct btrfs_root *root, struct inode *inode) -{ - return 0; -#if 0 - struct btrfs_path *path; - struct btrfs_key found_key; - struct extent_buffer *leaf; - struct btrfs_file_extent_item *extent; - u64 last_offset = 0; - int nritems; - int slot; - int found_type; - int ret; - int err = 0; - u64 extent_end = 0; - - path = btrfs_alloc_path(); - ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino, - last_offset, 0); - while (1) { - nritems = btrfs_header_nritems(path->nodes[0]); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret) - goto out; - nritems = btrfs_header_nritems(path->nodes[0]); - } - slot = path->slots[0]; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, slot); - if (found_key.objectid != inode->i_ino) - break; - if (found_key.type != BTRFS_EXTENT_DATA_KEY) - goto out; - - if (found_key.offset < last_offset) { - WARN_ON(1); - btrfs_print_leaf(root, leaf); - printk(KERN_ERR "inode %lu found offset %llu " - "expected %llu\n", inode->i_ino, - (unsigned long long)found_key.offset, - (unsigned long long)last_offset); - err = 1; - goto out; - } - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(leaf, extent); - if (found_type == BTRFS_FILE_EXTENT_REG) { - extent_end = found_key.offset + - btrfs_file_extent_num_bytes(leaf, extent); - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - struct btrfs_item *item; - item = btrfs_item_nr(leaf, slot); - extent_end = found_key.offset + - btrfs_file_extent_inline_len(leaf, extent); - extent_end = (extent_end + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); - } - last_offset = extent_end; - path->slots[0]++; - } - if (0 && last_offset < inode->i_size) { - WARN_ON(1); - btrfs_print_leaf(root, leaf); - printk(KERN_ERR "inode %lu found offset %llu size %llu\n", - inode->i_ino, (unsigned long long)last_offset, - (unsigned long long)inode->i_size); - err = 1; - - } -out: - btrfs_free_path(path); - return err; -#endif -} - /* * this is very complex, but the basic idea is to drop all extents * in the range start - end. hint_block is filled in with a block number @@ -689,7 +612,6 @@ out: unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end, locked_end - 1, GFP_NOFS); } - btrfs_check_file(root, inode); return ret; } -- cgit v1.1 From 193f284d4985db0370a8a1bbdfb20df548cf9ffb Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 27 Apr 2009 07:29:05 -0400 Subject: Btrfs: ratelimit IO error printks Btrfs has printks for various IO errors, including bad checksums and mismatches between what we expect the block headers to contain and what we actually find on the disk. Longer term we need a real reporting mechanism for this, but for now printk is going to have to do. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 38 +++++++++++++++++++++++++------------- fs/btrfs/inode.c | 10 ++++++---- 2 files changed, 31 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 77f9a3b..aa0c259 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -232,10 +232,14 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - printk(KERN_INFO "btrfs: %s checksum verify failed " - "on %llu wanted %X found %X level %d\n", - root->fs_info->sb->s_id, - buf->start, val, found, btrfs_header_level(buf)); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs: %s checksum verify " + "failed on %llu wanted %X found %X " + "level %d\n", + root->fs_info->sb->s_id, + (unsigned long long)buf->start, val, found, + btrfs_header_level(buf)); + } if (result != (char *)&inline_result) kfree(result); return 1; @@ -268,10 +272,13 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 0; goto out; } - printk("parent transid verify failed on %llu wanted %llu found %llu\n", - (unsigned long long)eb->start, - (unsigned long long)parent_transid, - (unsigned long long)btrfs_header_generation(eb)); + if (printk_ratelimit()) { + printk("parent transid verify failed on %llu wanted %llu " + "found %llu\n", + (unsigned long long)eb->start, + (unsigned long long)parent_transid, + (unsigned long long)btrfs_header_generation(eb)); + } ret = 1; clear_extent_buffer_uptodate(io_tree, eb); out: @@ -415,9 +422,12 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, found_start = btrfs_header_bytenr(eb); if (found_start != start) { - printk(KERN_INFO "btrfs bad tree block start %llu %llu\n", - (unsigned long long)found_start, - (unsigned long long)eb->start); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs bad tree block start " + "%llu %llu\n", + (unsigned long long)found_start, + (unsigned long long)eb->start); + } ret = -EIO; goto err; } @@ -429,8 +439,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, goto err; } if (check_tree_block_fsid(root, eb)) { - printk(KERN_INFO "btrfs bad fsid on block %llu\n", - (unsigned long long)eb->start); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs bad fsid on block %llu\n", + (unsigned long long)eb->start); + } ret = -EIO; goto err; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2fdb299..552e08a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1823,10 +1823,12 @@ good: return 0; zeroit: - printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " - "private %llu\n", page->mapping->host->i_ino, - (unsigned long long)start, csum, - (unsigned long long)private); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " + "private %llu\n", page->mapping->host->i_ino, + (unsigned long long)start, csum, + (unsigned long long)private); + } memset(kaddr + offset, 1, end - start + 1); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); -- cgit v1.1 From 45c06543afe2772c02f21efee0e2138b4e1c911e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 27 Apr 2009 07:49:10 -0400 Subject: Btrfs: remove unused btrfs_bit_radix slab Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 552e08a..98bd506 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -70,7 +70,6 @@ static struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; -struct kmem_cache *btrfs_bit_radix_cachep; struct kmem_cache *btrfs_path_cachep; #define S_SHIFT 12 @@ -4641,8 +4640,6 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_trans_handle_cachep); if (btrfs_transaction_cachep) kmem_cache_destroy(btrfs_transaction_cachep); - if (btrfs_bit_radix_cachep) - kmem_cache_destroy(btrfs_bit_radix_cachep); if (btrfs_path_cachep) kmem_cache_destroy(btrfs_path_cachep); } @@ -4673,11 +4670,6 @@ int btrfs_init_cachep(void) if (!btrfs_path_cachep) goto fail; - btrfs_bit_radix_cachep = kmem_cache_create("btrfs_radix", 256, 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | - SLAB_DESTROY_BY_RCU, NULL); - if (!btrfs_bit_radix_cachep) - goto fail; return 0; fail: btrfs_destroy_cachep(); -- cgit v1.1 From e63b6a6c0ffa2ebd8617cc1a10969000296831aa Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Tue, 21 Apr 2009 12:38:30 -0700 Subject: Btrfs: Fix a trivial warning using max() of u64 vs ULL. A small warning popped up on ia64 because inode-map.c was comparing a u64 object id with the ULL FIRST_FREE_OBJECTID. My first thought was that all the OBJECTID constants should contain the u64 cast because btrfs code deals entirely in u64s. But then I saw how large that was, and figured I'd just fix the max() call. Signed-off-by: Joel Becker Signed-off-by: Chris Mason --- fs/btrfs/inode-map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index cc7334d..9abbced 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -79,7 +79,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, } path = btrfs_alloc_path(); BUG_ON(!path); - search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID); + search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID); search_key.objectid = search_start; search_key.type = 0; search_key.offset = 0; -- cgit v1.1 From 21380931eb4da4e29ac663d0221581282cbba208 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Tue, 21 Apr 2009 12:38:29 -0700 Subject: Btrfs: Fix a bunch of printk() warnings. Just happened to notice a bunch of %llu vs u64 warnings. Here's a patch to cast them all. Signed-off-by: Joel Becker Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 6 +++--- fs/btrfs/extent-tree.c | 21 ++++++++++++++------- fs/btrfs/free-space-cache.c | 15 ++++++++++----- fs/btrfs/ioctl.c | 6 ++++-- fs/btrfs/super.c | 15 +++++++++------ 5 files changed, 40 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index aa0c259..0ff16d3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1671,7 +1671,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (features) { printk(KERN_ERR "BTRFS: couldn't mount because of " "unsupported optional features (%Lx).\n", - features); + (unsigned long long)features); err = -EINVAL; goto fail_iput; } @@ -1681,7 +1681,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!(sb->s_flags & MS_RDONLY) && features) { printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " "unsupported option features (%Lx).\n", - features); + (unsigned long long)features); err = -EINVAL; goto fail_iput; } @@ -2273,7 +2273,7 @@ int close_ctree(struct btrfs_root *root) if (fs_info->delalloc_bytes) { printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", - fs_info->delalloc_bytes); + (unsigned long long)fs_info->delalloc_bytes); } if (fs_info->total_ref_cache_size) { printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2895a83..e496644 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1844,10 +1844,14 @@ again: printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" ", %llu bytes_used, %llu bytes_reserved, " "%llu bytes_pinned, %llu bytes_readonly, %llu may use" - "%llu total\n", bytes, data_sinfo->bytes_delalloc, - data_sinfo->bytes_used, data_sinfo->bytes_reserved, - data_sinfo->bytes_pinned, data_sinfo->bytes_readonly, - data_sinfo->bytes_may_use, data_sinfo->total_bytes); + "%llu total\n", (unsigned long long)bytes, + (unsigned long long)data_sinfo->bytes_delalloc, + (unsigned long long)data_sinfo->bytes_used, + (unsigned long long)data_sinfo->bytes_reserved, + (unsigned long long)data_sinfo->bytes_pinned, + (unsigned long long)data_sinfo->bytes_readonly, + (unsigned long long)data_sinfo->bytes_may_use, + (unsigned long long)data_sinfo->total_bytes); return -ENOSPC; } data_sinfo->bytes_may_use += bytes; @@ -2824,9 +2828,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes) info->bytes_pinned - info->bytes_reserved), (info->full) ? "" : "not "); printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," - " may_use=%llu, used=%llu\n", info->total_bytes, - info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use, - info->bytes_used); + " may_use=%llu, used=%llu\n", + (unsigned long long)info->total_bytes, + (unsigned long long)info->bytes_pinned, + (unsigned long long)info->bytes_delalloc, + (unsigned long long)info->bytes_may_use, + (unsigned long long)info->bytes_used); down_read(&info->groups_sem); list_for_each_entry(cache, &info->block_groups, list) { diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 768b952..0bc9365 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -332,13 +332,17 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, printk(KERN_ERR "couldn't find space %llu to free\n", (unsigned long long)offset); printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n", - block_group->cached, block_group->key.objectid, - block_group->key.offset); + block_group->cached, + (unsigned long long)block_group->key.objectid, + (unsigned long long)block_group->key.offset); btrfs_dump_free_space(block_group, bytes); } else if (info) { printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, " "but wanted offset=%llu bytes=%llu\n", - info->offset, info->bytes, offset, bytes); + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (unsigned long long)offset, + (unsigned long long)bytes); } WARN_ON(1); } @@ -357,8 +361,9 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes) count++; - printk(KERN_ERR "entry offset %llu, bytes %llu\n", info->offset, - info->bytes); + printk(KERN_ERR "entry offset %llu, bytes %llu\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes); } printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" "\n", count); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f4e5d2e..48762aa 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -483,11 +483,13 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) *devstr = '\0'; devstr = vol_args->name; devid = simple_strtoull(devstr, &end, 10); - printk(KERN_INFO "resizing devid %llu\n", devid); + printk(KERN_INFO "resizing devid %llu\n", + (unsigned long long)devid); } device = btrfs_find_device(root, devid, NULL, NULL); if (!device) { - printk(KERN_INFO "resizer unable to find device %llu\n", devid); + printk(KERN_INFO "resizer unable to find device %llu\n", + (unsigned long long)devid); ret = -EINVAL; goto out_unlock; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 30c9a8c..bf0e84c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -196,7 +196,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->max_extent = max_t(u64, info->max_extent, root->sectorsize); printk(KERN_INFO "btrfs: max_extent at %llu\n", - info->max_extent); + (unsigned long long)info->max_extent); } break; case Opt_max_inline: @@ -211,7 +211,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) root->sectorsize); } printk(KERN_INFO "btrfs: max_inline at %llu\n", - info->max_inline); + (unsigned long long)info->max_inline); } break; case Opt_alloc_start: @@ -221,7 +221,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) kfree(num); printk(KERN_INFO "btrfs: allocations start at %llu\n", - info->alloc_start); + (unsigned long long)info->alloc_start); } break; case Opt_noacl: @@ -420,11 +420,14 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) if (btrfs_test_opt(root, NOBARRIER)) seq_puts(seq, ",nobarrier"); if (info->max_extent != (u64)-1) - seq_printf(seq, ",max_extent=%llu", info->max_extent); + seq_printf(seq, ",max_extent=%llu", + (unsigned long long)info->max_extent); if (info->max_inline != 8192 * 1024) - seq_printf(seq, ",max_inline=%llu", info->max_inline); + seq_printf(seq, ",max_inline=%llu", + (unsigned long long)info->max_inline); if (info->alloc_start != 0) - seq_printf(seq, ",alloc_start=%llu", info->alloc_start); + seq_printf(seq, ",alloc_start=%llu", + (unsigned long long)info->alloc_start); if (info->thread_pool_size != min_t(unsigned long, num_online_cpus() + 2, 8)) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); -- cgit v1.1 From fd1b52435a6d9663de896e8437ef067372916ef3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Apr 2009 18:10:06 +0200 Subject: quota: remove obsolete comments in fs/quota/Makefile Get rid of useless comments and the equally useless obj-y initialization. Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/quota/Makefile | 9 --------- 1 file changed, 9 deletions(-) (limited to 'fs') diff --git a/fs/quota/Makefile b/fs/quota/Makefile index 385a083..68d4f6d 100644 --- a/fs/quota/Makefile +++ b/fs/quota/Makefile @@ -1,12 +1,3 @@ -# -# Makefile for the Linux filesystems. -# -# 14 Sep 2000, Christoph Hellwig -# Rewritten to use lists instead of if-statements. -# - -obj-y := - obj-$(CONFIG_QUOTA) += dquot.o obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o -- cgit v1.1 From a069e9cee1dba2f847839d325f46ce6976ed1b76 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 9 Apr 2009 18:07:10 +0200 Subject: ext2: missing unlock in ext2_quota_write() The inode->i_mutex should be unlocked. Found by smatch (http://repo.or.cz/w/smatch.git). Compile tested. Signed-off-by: Dan Carpenter Signed-off-by: Jan Kara --- fs/ext2/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f983225..5c4afe6 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1395,8 +1395,10 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) + if (len == towrite) { + mutex_unlock(&inode->i_mutex); return err; + } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode->i_version++; -- cgit v1.1 From 7b1a14bbb0e547aaa4d30cc376e6c8c12539ab0f Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 27 Apr 2009 10:49:53 -0400 Subject: Btrfs: fix acl caching Linus noticed the btrfs code to cache acls wasn't properly caching a NULL acl when the inode didn't have any acls. This meant the common case of no acls resulted in expensive btree searches every time the kernel checked permissions (which is quite often). This is a modified version of Linus' original patch: Properly set initial acl fields to BTRFS_ACL_NOT_CACHED in the inode. This forces an acl lookup when permission checks are done. Fix btrfs_get_acl to avoid lookups and locking when the inode acls fields are set to null. Fix btrfs_get_acl to use the right return value from __btrfs_getxattr when deciding to cache a NULL acl. It was storing a NULL acl when __btrfs_getxattr return -ENOENT, but __btrfs_getxattr was actually returning -ENODATA for this case. Signed-off-by: Chris Mason --- fs/btrfs/acl.c | 18 +++++++++++++----- fs/btrfs/inode.c | 4 ++-- 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 7fdd184..cbba000 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -60,15 +60,20 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) return ERR_PTR(-EINVAL); } + /* Handle the cached NULL acl case without locking */ + acl = ACCESS_ONCE(*p_acl); + if (!acl) + return acl; + spin_lock(&inode->i_lock); - if (*p_acl != BTRFS_ACL_NOT_CACHED) - acl = posix_acl_dup(*p_acl); + acl = *p_acl; + if (acl != BTRFS_ACL_NOT_CACHED) + acl = posix_acl_dup(acl); spin_unlock(&inode->i_lock); - if (acl) + if (acl != BTRFS_ACL_NOT_CACHED) return acl; - size = __btrfs_getxattr(inode, name, "", 0); if (size > 0) { value = kzalloc(size, GFP_NOFS); @@ -80,9 +85,12 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) btrfs_update_cached_acl(inode, p_acl, acl); } kfree(value); - } else if (size == -ENOENT) { + } else if (size == -ENOENT || size == -ENODATA || size == 0) { + /* FIXME, who returns -ENOENT? I think nobody */ acl = NULL; btrfs_update_cached_acl(inode, p_acl, acl); + } else { + acl = ERR_PTR(-EIO); } return acl; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 98bd506..dc812ec 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3047,8 +3047,8 @@ static noinline void init_btrfs_i(struct inode *inode) { struct btrfs_inode *bi = BTRFS_I(inode); - bi->i_acl = NULL; - bi->i_default_acl = NULL; + bi->i_acl = BTRFS_ACL_NOT_CACHED; + bi->i_default_acl = BTRFS_ACL_NOT_CACHED; bi->generation = 0; bi->sequence = 0; -- cgit v1.1 From 46a53cca826e71effe59e3cb4f383622c33ebdcb Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 27 Apr 2009 11:47:50 -0400 Subject: Btrfs: look for acls during btrfs_read_locked_inode This changes btrfs_read_locked_inode() to peek ahead in the btree for acl items. If it is certain a given inode has no acls, it will set the in memory acl fields to null to avoid acl lookups completely. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dc812ec..90c23eb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2016,6 +2016,57 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) } /* + * very simple check to peek ahead in the leaf looking for xattrs. If we + * don't find any xattrs, we know there can't be any acls. + * + * slot is the slot the inode is in, objectid is the objectid of the inode + */ +static noinline int acls_after_inode_item(struct extent_buffer *leaf, + int slot, u64 objectid) +{ + u32 nritems = btrfs_header_nritems(leaf); + struct btrfs_key found_key; + int scanned = 0; + + slot++; + while (slot < nritems) { + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* we found a different objectid, there must not be acls */ + if (found_key.objectid != objectid) + return 0; + + /* we found an xattr, assume we've got an acl */ + if (found_key.type == BTRFS_XATTR_ITEM_KEY) + return 1; + + /* + * we found a key greater than an xattr key, there can't + * be any acls later on + */ + if (found_key.type > BTRFS_XATTR_ITEM_KEY) + return 0; + + slot++; + scanned++; + + /* + * it goes inode, inode backrefs, xattrs, extents, + * so if there are a ton of hard links to an inode there can + * be a lot of backrefs. Don't waste time searching too hard, + * this is just an optimization + */ + if (scanned >= 8) + break; + } + /* we hit the end of the leaf before we found an xattr or + * something larger than an xattr. We have to assume the inode + * has acls + */ + return 1; +} + +/* * read an inode from the btree into the in-memory inode */ void btrfs_read_locked_inode(struct inode *inode) @@ -2026,6 +2077,7 @@ void btrfs_read_locked_inode(struct inode *inode) struct btrfs_timespec *tspec; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; + int maybe_acls; u64 alloc_group_block; u32 rdev; int ret; @@ -2072,6 +2124,16 @@ void btrfs_read_locked_inode(struct inode *inode) alloc_group_block = btrfs_inode_block_group(leaf, inode_item); + /* + * try to precache a NULL acl entry for files that don't have + * any xattrs or acls + */ + maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); + if (!maybe_acls) { + BTRFS_I(inode)->i_acl = NULL; + BTRFS_I(inode)->i_default_acl = NULL; + } + BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, alloc_group_block, 0); btrfs_free_path(path); -- cgit v1.1 From 69838727bcd819a8fd73a88447801221788b0c6d Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 28 Apr 2009 20:24:29 +0200 Subject: bio: fix memcpy corruption in bio_copy_user_iov() st driver uses blk_rq_map_user() in order to just build a request out of page frames. In this case, map_data->offset is a non zero value and iov[0].iov_base is NULL. We need to increase nr_pages for that. Cc: stable@kernel.org Signed-off-by: FUJITA Tomonori Signed-off-by: Jens Axboe --- fs/bio.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 7bbc98f..9871164 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -817,6 +817,9 @@ struct bio *bio_copy_user_iov(struct request_queue *q, len += iov[i].iov_len; } + if (offset) + nr_pages++; + bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask); if (!bmd) return ERR_PTR(-ENOMEM); -- cgit v1.1 From 802b352f2934f799ec2e159f61db6506094a936e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 27 Apr 2009 21:24:28 -0700 Subject: ecryptfs: fix printk format warning fs/ecryptfs/inode.c:670: warning: format '%d' expects type 'int', but argument 3 has type 'size_t' Signed-off-by: Randy Dunlap Signed-off-by: Tyler Hicks Cc: Dustin Kirkland Signed-off-by: Andrew Morton --- fs/ecryptfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 93bc0f8..1940edd 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -667,7 +667,7 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL); if (lower_buf == NULL) { printk(KERN_ERR "%s: Out of memory whilst attempting to " - "kmalloc [%d] bytes\n", __func__, lower_bufsiz); + "kmalloc [%zd] bytes\n", __func__, lower_bufsiz); rc = -ENOMEM; goto out; } -- cgit v1.1 From ac20100df7a7a042423dcb8847f42d9f6ddb8d00 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Mon, 27 Apr 2009 13:31:12 -0500 Subject: eCryptfs: Fix min function comparison warning This warning shows up on 64 bit builds: fs/ecryptfs/inode.c:693: warning: comparison of distinct pointer types lacks a cast Signed-off-by: Tyler Hicks --- fs/ecryptfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 1940edd..2f0945d 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -690,7 +690,7 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) } /* Check for bufsiz <= 0 done in sys_readlinkat() */ rc = copy_to_user(buf, plaintext_name, - min((unsigned) bufsiz, plaintext_name_size)); + min((size_t) bufsiz, plaintext_name_size)); if (rc) rc = -EFAULT; else -- cgit v1.1