From 55708698c5f153f4e390175cdfc395333b2eafbd Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Tue, 16 Jul 2013 17:56:12 +0800 Subject: fs/anon_inode: Introduce a new lib function anon_inode_getfile_private() Introduce a new lib function anon_inode_getfile_private(), it creates a new file instance by hooking it up to an anonymous inode, and a dentry that describe the "class" of the file, similar to anon_inode_getfile(), but each file holds a single inode. Furthermore, anyone who wants to create a private anon file will benefit from this change. Signed-off-by: Gu Zheng Signed-off-by: Benjamin LaHaise --- fs/anon_inodes.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'fs') diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 47a65df..85c9618 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -109,6 +109,72 @@ static struct file_system_type anon_inode_fs_type = { }; /** + * anon_inode_getfile_private - creates a new file instance by hooking it up to an + * anonymous inode, and a dentry that describe the "class" + * of the file + * + * @name: [in] name of the "class" of the new file + * @fops: [in] file operations for the new file + * @priv: [in] private data for the new file (will be file's private_data) + * @flags: [in] flags + * + * + * Similar to anon_inode_getfile, but each file holds a single inode. + * + */ +struct file *anon_inode_getfile_private(const char *name, + const struct file_operations *fops, + void *priv, int flags) +{ + struct qstr this; + struct path path; + struct file *file; + struct inode *inode; + + if (fops->owner && !try_module_get(fops->owner)) + return ERR_PTR(-ENOENT); + + inode = anon_inode_mkinode(anon_inode_mnt->mnt_sb); + if (IS_ERR(inode)) { + file = ERR_PTR(-ENOMEM); + goto err_module; + } + + /* + * Link the inode to a directory entry by creating a unique name + * using the inode sequence number. + */ + file = ERR_PTR(-ENOMEM); + this.name = name; + this.len = strlen(name); + this.hash = 0; + path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this); + if (!path.dentry) + goto err_module; + + path.mnt = mntget(anon_inode_mnt); + + d_instantiate(path.dentry, inode); + + file = alloc_file(&path, OPEN_FMODE(flags), fops); + if (IS_ERR(file)) + goto err_dput; + + file->f_mapping = inode->i_mapping; + file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); + file->private_data = priv; + + return file; + +err_dput: + path_put(&path); +err_module: + module_put(fops->owner); + return file; +} +EXPORT_SYMBOL_GPL(anon_inode_getfile_private); + +/** * anon_inode_getfile - creates a new file instance by hooking it up to an * anonymous inode, and a dentry that describe the "class" * of the file -- cgit v1.1 From 36bc08cc01709b4a9bb563b35aa530241ddc63e3 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Tue, 16 Jul 2013 17:56:16 +0800 Subject: fs/aio: Add support to aio ring pages migration As the aio job will pin the ring pages, that will lead to mem migrated failed. In order to fix this problem we use an anon inode to manage the aio ring pages, and setup the migratepage callback in the anon inode's address space, so that when mem migrating the aio ring pages will be moved to other mem node safely. Signed-off-by: Gu Zheng Signed-off-by: Benjamin LaHaise --- fs/aio.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 108 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 9b5ca11..cbd0afe 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -35,6 +35,9 @@ #include #include #include +#include +#include +#include #include #include @@ -110,6 +113,7 @@ struct kioctx { } ____cacheline_aligned_in_smp; struct page *internal_pages[AIO_RING_PAGES]; + struct file *aio_ring_file; }; /*------ sysctl variables----*/ @@ -138,15 +142,78 @@ __initcall(aio_setup); static void aio_free_ring(struct kioctx *ctx) { - long i; + int i; + struct file *aio_ring_file = ctx->aio_ring_file; - for (i = 0; i < ctx->nr_pages; i++) + for (i = 0; i < ctx->nr_pages; i++) { + pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, + page_count(ctx->ring_pages[i])); put_page(ctx->ring_pages[i]); + } if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) kfree(ctx->ring_pages); + + if (aio_ring_file) { + truncate_setsize(aio_ring_file->f_inode, 0); + pr_debug("pid(%d) i_nlink=%u d_count=%d d_unhashed=%d i_count=%d\n", + current->pid, aio_ring_file->f_inode->i_nlink, + aio_ring_file->f_path.dentry->d_count, + d_unhashed(aio_ring_file->f_path.dentry), + atomic_read(&aio_ring_file->f_inode->i_count)); + fput(aio_ring_file); + ctx->aio_ring_file = NULL; + } } +static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &generic_file_vm_ops; + return 0; +} + +static const struct file_operations aio_ring_fops = { + .mmap = aio_ring_mmap, +}; + +static int aio_set_page_dirty(struct page *page) +{ + return 0; +} + +static int aio_migratepage(struct address_space *mapping, struct page *new, + struct page *old, enum migrate_mode mode) +{ + struct kioctx *ctx = mapping->private_data; + unsigned long flags; + unsigned idx = old->index; + int rc; + + /* Writeback must be complete */ + BUG_ON(PageWriteback(old)); + put_page(old); + + rc = migrate_page_move_mapping(mapping, new, old, NULL, mode); + if (rc != MIGRATEPAGE_SUCCESS) { + get_page(old); + return rc; + } + + get_page(new); + + spin_lock_irqsave(&ctx->completion_lock, flags); + migrate_page_copy(new, old); + ctx->ring_pages[idx] = new; + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + return rc; +} + +static const struct address_space_operations aio_ctx_aops = { + .set_page_dirty = aio_set_page_dirty, + .migratepage = aio_migratepage, +}; + static int aio_setup_ring(struct kioctx *ctx) { struct aio_ring *ring; @@ -154,20 +221,45 @@ static int aio_setup_ring(struct kioctx *ctx) struct mm_struct *mm = current->mm; unsigned long size, populate; int nr_pages; + int i; + struct file *file; /* Compensate for the ring buffer's head/tail overlap entry */ nr_events += 2; /* 1 is required, 2 for good luck */ size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; - nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; + nr_pages = PFN_UP(size); if (nr_pages < 0) return -EINVAL; - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); + file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR); + if (IS_ERR(file)) { + ctx->aio_ring_file = NULL; + return -EAGAIN; + } + + file->f_inode->i_mapping->a_ops = &aio_ctx_aops; + file->f_inode->i_mapping->private_data = ctx; + file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages; + + for (i = 0; i < nr_pages; i++) { + struct page *page; + page = find_or_create_page(file->f_inode->i_mapping, + i, GFP_HIGHUSER | __GFP_ZERO); + if (!page) + break; + pr_debug("pid(%d) page[%d]->count=%d\n", + current->pid, i, page_count(page)); + SetPageUptodate(page); + SetPageDirty(page); + unlock_page(page); + } + ctx->aio_ring_file = file; + nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) + / sizeof(struct io_event); - ctx->nr_events = 0; ctx->ring_pages = ctx->internal_pages; if (nr_pages > AIO_RING_PAGES) { ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), @@ -178,28 +270,31 @@ static int aio_setup_ring(struct kioctx *ctx) ctx->mmap_size = nr_pages * PAGE_SIZE; pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); + down_write(&mm->mmap_sem); - ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size, - PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate); + ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, 0, &populate); if (IS_ERR((void *)ctx->mmap_base)) { up_write(&mm->mmap_sem); ctx->mmap_size = 0; aio_free_ring(ctx); return -EAGAIN; } + up_write(&mm->mmap_sem); + + mm_populate(ctx->mmap_base, populate); pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, 1, 0, ctx->ring_pages, NULL); - up_write(&mm->mmap_sem); + for (i = 0; i < ctx->nr_pages; i++) + put_page(ctx->ring_pages[i]); if (unlikely(ctx->nr_pages != nr_pages)) { aio_free_ring(ctx); return -EAGAIN; } - if (populate) - mm_populate(ctx->mmap_base, populate); ctx->user_id = ctx->mmap_base; ctx->nr_events = nr_events; /* trusted copy */ @@ -399,6 +494,8 @@ out_cleanup: err = -EAGAIN; aio_free_ring(ctx); out_freectx: + if (ctx->aio_ring_file) + fput(ctx->aio_ring_file); kmem_cache_free(kioctx_cachep, ctx); pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); -- cgit v1.1 From 0c45355fc7c48c82db151bf0e7ca305d513e639e Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Wed, 17 Jul 2013 09:34:24 -0400 Subject: aio: fix build when migration is disabled When "fs/aio: Add support to aio ring pages migration" was applied, it broke the build when CONFIG_MIGRATION was disabled. Wrap the migration code with a test for CONFIG_MIGRATION to fix this and save a few bytes when migration is disabled. Signed-off-by: Benjamin LaHaise --- fs/aio.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index cbd0afe..dedeea0 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -181,6 +181,7 @@ static int aio_set_page_dirty(struct page *page) return 0; } +#if IS_ENABLED(CONFIG_MIGRATION) static int aio_migratepage(struct address_space *mapping, struct page *new, struct page *old, enum migrate_mode mode) { @@ -208,10 +209,13 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, return rc; } +#endif static const struct address_space_operations aio_ctx_aops = { .set_page_dirty = aio_set_page_dirty, +#if IS_ENABLED(CONFIG_MIGRATION) .migratepage = aio_migratepage, +#endif }; static int aio_setup_ring(struct kioctx *ctx) -- cgit v1.1 From 34e83fc618085e00dc9803286c581f51966673bd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Apr 2013 10:58:39 +1000 Subject: aio: reqs_active -> reqs_available The number of outstanding kiocbs is one of the few shared things left that has to be touched for every kiocb - it'd be nice to make it percpu. We can make it per cpu by treating it like an allocation problem: we have a maximum number of kiocbs that can be outstanding (i.e. slots) - then we just allocate and free slots, and we know how to write per cpu allocators. So as prep work for that, we convert reqs_active to reqs_available. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Reviewed-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Benjamin LaHaise --- fs/aio.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index dedeea0..0e23dfa 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -94,7 +94,13 @@ struct kioctx { struct work_struct rcu_work; struct { - atomic_t reqs_active; + /* + * This counts the number of available slots in the ringbuffer, + * so we avoid overflowing it: it's decremented (if positive) + * when allocating a kiocb and incremented when the resulting + * io_event is pulled off the ringbuffer. + */ + atomic_t reqs_available; } ____cacheline_aligned_in_smp; struct { @@ -404,19 +410,20 @@ static void free_ioctx(struct kioctx *ctx) head = ring->head; kunmap_atomic(ring); - while (atomic_read(&ctx->reqs_active) > 0) { + while (atomic_read(&ctx->reqs_available) < ctx->nr_events - 1) { wait_event(ctx->wait, - head != ctx->tail || - atomic_read(&ctx->reqs_active) <= 0); + (head != ctx->tail) || + (atomic_read(&ctx->reqs_available) >= + ctx->nr_events - 1)); avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; - atomic_sub(avail, &ctx->reqs_active); + atomic_add(avail, &ctx->reqs_available); head += avail; head %= ctx->nr_events; } - WARN_ON(atomic_read(&ctx->reqs_active) < 0); + WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1); aio_free_ring(ctx); @@ -475,6 +482,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) if (aio_setup_ring(ctx) < 0) goto out_freectx; + atomic_set(&ctx->reqs_available, ctx->nr_events - 1); + /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); if (aio_nr + nr_events > aio_max_nr || @@ -586,7 +595,7 @@ void exit_aio(struct mm_struct *mm) "exit_aio:ioctx still alive: %d %d %d\n", atomic_read(&ctx->users), atomic_read(&ctx->dead), - atomic_read(&ctx->reqs_active)); + atomic_read(&ctx->reqs_available)); /* * We don't need to bother with munmap() here - * exit_mmap(mm) is coming and it'll unmap everything. @@ -615,12 +624,9 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) { struct kiocb *req; - if (atomic_read(&ctx->reqs_active) >= ctx->nr_events) + if (atomic_dec_if_positive(&ctx->reqs_available) <= 0) return NULL; - if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1) - goto out_put; - req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); if (unlikely(!req)) goto out_put; @@ -630,7 +636,7 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) return req; out_put: - atomic_dec(&ctx->reqs_active); + atomic_inc(&ctx->reqs_available); return NULL; } @@ -701,7 +707,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) /* * Take rcu_read_lock() in case the kioctx is being destroyed, as we - * need to issue a wakeup after decrementing reqs_active. + * need to issue a wakeup after incrementing reqs_available. */ rcu_read_lock(); @@ -719,7 +725,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) */ if (unlikely(xchg(&iocb->ki_cancel, KIOCB_CANCELLED) == KIOCB_CANCELLED)) { - atomic_dec(&ctx->reqs_active); + atomic_inc(&ctx->reqs_available); /* Still need the wake_up in case free_ioctx is waiting */ goto put_rq; } @@ -857,7 +863,7 @@ static long aio_read_events_ring(struct kioctx *ctx, pr_debug("%li h%u t%u\n", ret, head, ctx->tail); - atomic_sub(ret, &ctx->reqs_active); + atomic_add(ret, &ctx->reqs_available); out: mutex_unlock(&ctx->ring_lock); @@ -1241,7 +1247,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, aio_put_req(req); /* drop extra ref to req */ return 0; out_put_req: - atomic_dec(&ctx->reqs_active); + atomic_inc(&ctx->reqs_available); aio_put_req(req); /* drop extra ref to req */ aio_put_req(req); /* drop i/o ref to req */ return ret; -- cgit v1.1 From e1bdd5f27a5b14e24a658d5511bebceb67679d83 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Apr 2013 10:58:39 +1000 Subject: aio: percpu reqs_available See the previous patch ("aio: reqs_active -> reqs_available") for why we want to do this - this basically implements a per cpu allocator for reqs_available that doesn't actually allocate anything. Note that we need to increase the size of the ringbuffer we allocate, since a single thread won't necessarily be able to use all the reqs_available slots - some (up to about half) might be on other per cpu lists, unavailable for the current thread. We size the ringbuffer based on the nr_events userspace passed to io_setup(), so this is a slight behaviour change - but nr_events wasn't being used as a hard limit before, it was being rounded up to the next page before so this doesn't change the actual semantics. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Reviewed-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Benjamin LaHaise --- fs/aio.c | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 99 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 0e23dfa..bb1a6c4 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -64,6 +65,10 @@ struct aio_ring { #define AIO_RING_PAGES 8 +struct kioctx_cpu { + unsigned reqs_available; +}; + struct kioctx { atomic_t users; atomic_t dead; @@ -72,6 +77,13 @@ struct kioctx { unsigned long user_id; struct hlist_node list; + struct __percpu kioctx_cpu *cpu; + + /* + * For percpu reqs_available, number of slots we move to/from global + * counter at a time: + */ + unsigned req_batch; /* * This is what userspace passed to io_setup(), it's not used for * anything but counting against the global max_reqs quota. @@ -99,6 +111,8 @@ struct kioctx { * so we avoid overflowing it: it's decremented (if positive) * when allocating a kiocb and incremented when the resulting * io_event is pulled off the ringbuffer. + * + * We batch accesses to it with a percpu version. */ atomic_t reqs_available; } ____cacheline_aligned_in_smp; @@ -379,6 +393,8 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, static void free_ioctx_rcu(struct rcu_head *head) { struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); + + free_percpu(ctx->cpu); kmem_cache_free(kioctx_cachep, ctx); } @@ -392,7 +408,7 @@ static void free_ioctx(struct kioctx *ctx) struct aio_ring *ring; struct io_event res; struct kiocb *req; - unsigned head, avail; + unsigned cpu, head, avail; spin_lock_irq(&ctx->ctx_lock); @@ -406,6 +422,13 @@ static void free_ioctx(struct kioctx *ctx) spin_unlock_irq(&ctx->ctx_lock); + for_each_possible_cpu(cpu) { + struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu); + + atomic_add(kcpu->reqs_available, &ctx->reqs_available); + kcpu->reqs_available = 0; + } + ring = kmap_atomic(ctx->ring_pages[0]); head = ring->head; kunmap_atomic(ring); @@ -454,6 +477,18 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) struct kioctx *ctx; int err = -ENOMEM; + /* + * We keep track of the number of available ringbuffer slots, to prevent + * overflow (reqs_available), and we also use percpu counters for this. + * + * So since up to half the slots might be on other cpu's percpu counters + * and unavailable, double nr_events so userspace sees what they + * expected: additionally, we move req_batch slots to/from percpu + * counters at a time, so make sure that isn't 0: + */ + nr_events = max(nr_events, num_possible_cpus() * 4); + nr_events *= 2; + /* Prevent overflows */ if ((nr_events > (0x10000000U / sizeof(struct io_event))) || (nr_events > (0x10000000U / sizeof(struct kiocb)))) { @@ -479,10 +514,16 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) INIT_LIST_HEAD(&ctx->active_reqs); - if (aio_setup_ring(ctx) < 0) + ctx->cpu = alloc_percpu(struct kioctx_cpu); + if (!ctx->cpu) goto out_freectx; + if (aio_setup_ring(ctx) < 0) + goto out_freepcpu; + atomic_set(&ctx->reqs_available, ctx->nr_events - 1); + ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); + BUG_ON(!ctx->req_batch); /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); @@ -506,6 +547,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) out_cleanup: err = -EAGAIN; aio_free_ring(ctx); +out_freepcpu: + free_percpu(ctx->cpu); out_freectx: if (ctx->aio_ring_file) fput(ctx->aio_ring_file); @@ -610,6 +653,52 @@ void exit_aio(struct mm_struct *mm) } } +static void put_reqs_available(struct kioctx *ctx, unsigned nr) +{ + struct kioctx_cpu *kcpu; + + preempt_disable(); + kcpu = this_cpu_ptr(ctx->cpu); + + kcpu->reqs_available += nr; + while (kcpu->reqs_available >= ctx->req_batch * 2) { + kcpu->reqs_available -= ctx->req_batch; + atomic_add(ctx->req_batch, &ctx->reqs_available); + } + + preempt_enable(); +} + +static bool get_reqs_available(struct kioctx *ctx) +{ + struct kioctx_cpu *kcpu; + bool ret = false; + + preempt_disable(); + kcpu = this_cpu_ptr(ctx->cpu); + + if (!kcpu->reqs_available) { + int old, avail = atomic_read(&ctx->reqs_available); + + do { + if (avail < ctx->req_batch) + goto out; + + old = avail; + avail = atomic_cmpxchg(&ctx->reqs_available, + avail, avail - ctx->req_batch); + } while (avail != old); + + kcpu->reqs_available += ctx->req_batch; + } + + ret = true; + kcpu->reqs_available--; +out: + preempt_enable(); + return ret; +} + /* aio_get_req * Allocate a slot for an aio request. Increments the ki_users count * of the kioctx so that the kioctx stays around until all requests are @@ -624,7 +713,7 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) { struct kiocb *req; - if (atomic_dec_if_positive(&ctx->reqs_available) <= 0) + if (!get_reqs_available(ctx)) return NULL; req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); @@ -633,10 +722,9 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) atomic_set(&req->ki_users, 2); req->ki_ctx = ctx; - return req; out_put: - atomic_inc(&ctx->reqs_available); + put_reqs_available(ctx, 1); return NULL; } @@ -725,6 +813,10 @@ void aio_complete(struct kiocb *iocb, long res, long res2) */ if (unlikely(xchg(&iocb->ki_cancel, KIOCB_CANCELLED) == KIOCB_CANCELLED)) { + /* + * Can't use the percpu reqs_available here - could race with + * free_ioctx() + */ atomic_inc(&ctx->reqs_available); /* Still need the wake_up in case free_ioctx is waiting */ goto put_rq; @@ -863,7 +955,7 @@ static long aio_read_events_ring(struct kioctx *ctx, pr_debug("%li h%u t%u\n", ret, head, ctx->tail); - atomic_add(ret, &ctx->reqs_available); + put_reqs_available(ctx, ret); out: mutex_unlock(&ctx->ring_lock); @@ -1247,7 +1339,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, aio_put_req(req); /* drop extra ref to req */ return 0; out_put_req: - atomic_inc(&ctx->reqs_available); + put_reqs_available(ctx, 1); aio_put_req(req); /* drop extra ref to req */ aio_put_req(req); /* drop i/o ref to req */ return ret; -- cgit v1.1 From 723be6e39d14254bb5bb9f422b434566d359fa6e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 May 2013 15:14:48 -0700 Subject: aio: percpu ioctx refcount This just converts the ioctx refcount to the new generic dynamic percpu refcount code. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Reviewed-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Benjamin LaHaise --- fs/aio.c | 66 ++++++++++++++++++++++++++-------------------------------------- 1 file changed, 27 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index bb1a6c4..7b470bf 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -70,7 +71,7 @@ struct kioctx_cpu { }; struct kioctx { - atomic_t users; + struct percpu_ref users; atomic_t dead; /* This needs improving */ @@ -103,7 +104,7 @@ struct kioctx { long nr_pages; struct rcu_head rcu_head; - struct work_struct rcu_work; + struct work_struct free_work; struct { /* @@ -403,8 +404,9 @@ static void free_ioctx_rcu(struct rcu_head *head) * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - * now it's safe to cancel any that need to be. */ -static void free_ioctx(struct kioctx *ctx) +static void free_ioctx(struct work_struct *work) { + struct kioctx *ctx = container_of(work, struct kioctx, free_work); struct aio_ring *ring; struct io_event res; struct kiocb *req; @@ -462,10 +464,12 @@ static void free_ioctx(struct kioctx *ctx) call_rcu(&ctx->rcu_head, free_ioctx_rcu); } -static void put_ioctx(struct kioctx *ctx) +static void free_ioctx_ref(struct percpu_ref *ref) { - if (unlikely(atomic_dec_and_test(&ctx->users))) - free_ioctx(ctx); + struct kioctx *ctx = container_of(ref, struct kioctx, users); + + INIT_WORK(&ctx->free_work, free_ioctx); + schedule_work(&ctx->free_work); } /* ioctx_alloc @@ -505,8 +509,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->max_reqs = nr_events; - atomic_set(&ctx->users, 2); - atomic_set(&ctx->dead, 0); + if (percpu_ref_init(&ctx->users, free_ioctx_ref)) + goto out_freectx; + spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->completion_lock); mutex_init(&ctx->ring_lock); @@ -516,7 +521,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->cpu = alloc_percpu(struct kioctx_cpu); if (!ctx->cpu) - goto out_freectx; + goto out_freeref; if (aio_setup_ring(ctx) < 0) goto out_freepcpu; @@ -535,6 +540,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) aio_nr += ctx->max_reqs; spin_unlock(&aio_nr_lock); + percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ + /* now link into global list. */ spin_lock(&mm->ioctx_lock); hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); @@ -549,6 +556,8 @@ out_cleanup: aio_free_ring(ctx); out_freepcpu: free_percpu(ctx->cpu); +out_freeref: + free_percpu(ctx->users.pcpu_count); out_freectx: if (ctx->aio_ring_file) fput(ctx->aio_ring_file); @@ -557,22 +566,6 @@ out_freectx: return ERR_PTR(err); } -static void kill_ioctx_work(struct work_struct *work) -{ - struct kioctx *ctx = container_of(work, struct kioctx, rcu_work); - - wake_up_all(&ctx->wait); - put_ioctx(ctx); -} - -static void kill_ioctx_rcu(struct rcu_head *head) -{ - struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); - - INIT_WORK(&ctx->rcu_work, kill_ioctx_work); - schedule_work(&ctx->rcu_work); -} - /* kill_ioctx * Cancels all outstanding aio requests on an aio context. Used * when the processes owning a context have all exited to encourage @@ -582,6 +575,8 @@ static void kill_ioctx(struct kioctx *ctx) { if (!atomic_xchg(&ctx->dead, 1)) { hlist_del_rcu(&ctx->list); + /* percpu_ref_kill() will do the necessary call_rcu() */ + wake_up_all(&ctx->wait); /* * It'd be more correct to do this in free_ioctx(), after all @@ -598,8 +593,7 @@ static void kill_ioctx(struct kioctx *ctx) if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); - /* Between hlist_del_rcu() and dropping the initial ref */ - call_rcu(&ctx->rcu_head, kill_ioctx_rcu); + percpu_ref_kill(&ctx->users); } } @@ -633,12 +627,6 @@ void exit_aio(struct mm_struct *mm) struct hlist_node *n; hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) { - if (1 != atomic_read(&ctx->users)) - printk(KERN_DEBUG - "exit_aio:ioctx still alive: %d %d %d\n", - atomic_read(&ctx->users), - atomic_read(&ctx->dead), - atomic_read(&ctx->reqs_available)); /* * We don't need to bother with munmap() here - * exit_mmap(mm) is coming and it'll unmap everything. @@ -757,7 +745,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { if (ctx->user_id == ctx_id) { - atomic_inc(&ctx->users); + percpu_ref_get(&ctx->users); ret = ctx; break; } @@ -1054,7 +1042,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) ret = put_user(ioctx->user_id, ctxp); if (ret) kill_ioctx(ioctx); - put_ioctx(ioctx); + percpu_ref_put(&ioctx->users); } out: @@ -1072,7 +1060,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { kill_ioctx(ioctx); - put_ioctx(ioctx); + percpu_ref_put(&ioctx->users); return 0; } pr_debug("EINVAL: io_destroy: invalid context id\n"); @@ -1394,7 +1382,7 @@ long do_io_submit(aio_context_t ctx_id, long nr, } blk_finish_plug(&plug); - put_ioctx(ctx); + percpu_ref_put(&ctx->users); return i ? i : ret; } @@ -1483,7 +1471,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, ret = -EFAULT; } - put_ioctx(ctx); + percpu_ref_put(&ctx->users); return ret; } @@ -1512,7 +1500,7 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, if (likely(ioctx)) { if (likely(min_nr <= nr && min_nr >= 0)) ret = read_events(ioctx, min_nr, nr, events, timeout); - put_ioctx(ioctx); + percpu_ref_put(&ioctx->users); } return ret; } -- cgit v1.1 From bec68faaf3ba74ed0dcd5dc3a881b30aec542973 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 May 2013 14:45:08 -0700 Subject: aio: io_cancel() no longer returns the io_event Originally, io_event() was documented to return the io_event if cancellation succeeded - the io_event wouldn't be delivered via the ring buffer like it normally would. But this isn't what the implementation was actually doing; the only driver implementing cancellation, the usb gadget code, never returned an io_event in its cancel function. And aio_complete() was recently changed to no longer suppress event delivery if the kiocb had been cancelled. This gets rid of the unused io_event argument to kiocb_cancel() and kiocb->ki_cancel(), and changes io_cancel() to return -EINPROGRESS if kiocb->ki_cancel() returned success. Also tweak the refcounting in kiocb_cancel() to make more sense. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Signed-off-by: Benjamin LaHaise --- fs/aio.c | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 7b470bf..12b3768 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -358,8 +358,7 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) } EXPORT_SYMBOL(kiocb_set_cancel_fn); -static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, - struct io_event *res) +static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb) { kiocb_cancel_fn *old, *cancel; int ret = -EINVAL; @@ -381,12 +380,10 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, atomic_inc(&kiocb->ki_users); spin_unlock_irq(&ctx->ctx_lock); - memset(res, 0, sizeof(*res)); - res->obj = (u64)(unsigned long)kiocb->ki_obj.user; - res->data = kiocb->ki_user_data; - ret = cancel(kiocb, res); + ret = cancel(kiocb); spin_lock_irq(&ctx->ctx_lock); + aio_put_req(kiocb); return ret; } @@ -408,7 +405,6 @@ static void free_ioctx(struct work_struct *work) { struct kioctx *ctx = container_of(work, struct kioctx, free_work); struct aio_ring *ring; - struct io_event res; struct kiocb *req; unsigned cpu, head, avail; @@ -419,7 +415,7 @@ static void free_ioctx(struct work_struct *work) struct kiocb, ki_list); list_del_init(&req->ki_list); - kiocb_cancel(ctx, req, &res); + kiocb_cancel(ctx, req); } spin_unlock_irq(&ctx->ctx_lock); @@ -796,21 +792,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) } /* - * cancelled requests don't get events, userland was given one - * when the event got cancelled. - */ - if (unlikely(xchg(&iocb->ki_cancel, - KIOCB_CANCELLED) == KIOCB_CANCELLED)) { - /* - * Can't use the percpu reqs_available here - could race with - * free_ioctx() - */ - atomic_inc(&ctx->reqs_available); - /* Still need the wake_up in case free_ioctx is waiting */ - goto put_rq; - } - - /* * Add a completion event to the ring buffer. Must be done holding * ctx->completion_lock to prevent other code from messing with the tail * pointer since we might be called from irq context. @@ -862,7 +843,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) if (iocb->ki_eventfd != NULL) eventfd_signal(iocb->ki_eventfd, 1); -put_rq: /* everything turned out well, dispose of the aiocb. */ aio_put_req(iocb); @@ -1439,7 +1419,6 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct io_event __user *, result) { - struct io_event res; struct kioctx *ctx; struct kiocb *kiocb; u32 key; @@ -1457,18 +1436,19 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, kiocb = lookup_kiocb(ctx, iocb, key); if (kiocb) - ret = kiocb_cancel(ctx, kiocb, &res); + ret = kiocb_cancel(ctx, kiocb); else ret = -EINVAL; spin_unlock_irq(&ctx->ctx_lock); if (!ret) { - /* Cancellation succeeded -- copy the result - * into the user's buffer. + /* + * The result argument is no longer used - the io_event is + * always delivered via the ring buffer. -EINPROGRESS indicates + * cancellation is progress: */ - if (copy_to_user(result, &res, sizeof(res))) - ret = -EFAULT; + ret = -EINPROGRESS; } percpu_ref_put(&ctx->users); -- cgit v1.1 From 5ffac122dbda89fbb29885f35a5d47b0edb8936d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 9 May 2013 15:36:07 -0700 Subject: aio: Don't use ctx->tail unnecessarily aio_complete() (arguably) needs to keep its own trusted copy of the tail pointer, but io_getevents() doesn't have to use it - it's already using the head pointer from the ring buffer. So convert it to use the tail from the ring buffer so it touches fewer cachelines and doesn't contend with the cacheline aio_complete() needs. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Signed-off-by: Benjamin LaHaise --- fs/aio.c | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 12b3768..57b0279 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -406,7 +406,8 @@ static void free_ioctx(struct work_struct *work) struct kioctx *ctx = container_of(work, struct kioctx, free_work); struct aio_ring *ring; struct kiocb *req; - unsigned cpu, head, avail; + unsigned cpu, avail; + DEFINE_WAIT(wait); spin_lock_irq(&ctx->ctx_lock); @@ -427,22 +428,24 @@ static void free_ioctx(struct work_struct *work) kcpu->reqs_available = 0; } - ring = kmap_atomic(ctx->ring_pages[0]); - head = ring->head; - kunmap_atomic(ring); + while (1) { + prepare_to_wait(&ctx->wait, &wait, TASK_UNINTERRUPTIBLE); - while (atomic_read(&ctx->reqs_available) < ctx->nr_events - 1) { - wait_event(ctx->wait, - (head != ctx->tail) || - (atomic_read(&ctx->reqs_available) >= - ctx->nr_events - 1)); - - avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; + ring = kmap_atomic(ctx->ring_pages[0]); + avail = (ring->head <= ring->tail) + ? ring->tail - ring->head + : ctx->nr_events - ring->head + ring->tail; atomic_add(avail, &ctx->reqs_available); - head += avail; - head %= ctx->nr_events; + ring->head = ring->tail; + kunmap_atomic(ring); + + if (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1) + break; + + schedule(); } + finish_wait(&ctx->wait, &wait); WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1); @@ -869,7 +872,7 @@ static long aio_read_events_ring(struct kioctx *ctx, struct io_event __user *event, long nr) { struct aio_ring *ring; - unsigned head, pos; + unsigned head, tail, pos; long ret = 0; int copy_ret; @@ -877,11 +880,12 @@ static long aio_read_events_ring(struct kioctx *ctx, ring = kmap_atomic(ctx->ring_pages[0]); head = ring->head; + tail = ring->tail; kunmap_atomic(ring); - pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events); + pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events); - if (head == ctx->tail) + if (head == tail) goto out; while (ret < nr) { @@ -889,8 +893,8 @@ static long aio_read_events_ring(struct kioctx *ctx, struct io_event *ev; struct page *page; - avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; - if (head == ctx->tail) + avail = (head <= tail ? tail : ctx->nr_events) - head; + if (head == tail) break; avail = min(avail, nr - ret); @@ -921,7 +925,7 @@ static long aio_read_events_ring(struct kioctx *ctx, kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); - pr_debug("%li h%u t%u\n", ret, head, ctx->tail); + pr_debug("%li h%u t%u\n", ret, head, tail); put_reqs_available(ctx, ret); out: -- cgit v1.1 From 73a7075e3f6ec63dc359064eea6fd84f406cf2a5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 9 May 2013 15:03:42 -0700 Subject: aio: Kill aio_rw_vect_retry() This code doesn't serve any purpose anymore, since the aio retry infrastructure has been removed. This change should be safe because aio_read/write are also used for synchronous IO, and called from do_sync_read()/do_sync_write() - and there's no looping done in the sync case (the read and write syscalls). Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Signed-off-by: Benjamin LaHaise --- fs/aio.c | 91 ++++++++++++--------------------------------------------- fs/block_dev.c | 2 +- fs/nfs/direct.c | 1 - fs/ocfs2/file.c | 6 ++-- fs/read_write.c | 3 -- fs/udf/file.c | 2 +- 6 files changed, 23 insertions(+), 82 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 57b0279..d00904d 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -707,7 +707,7 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) if (unlikely(!req)) goto out_put; - atomic_set(&req->ki_users, 2); + atomic_set(&req->ki_users, 1); req->ki_ctx = ctx; return req; out_put: @@ -1051,75 +1051,9 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) return -EINVAL; } -static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) -{ - struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg]; - - BUG_ON(ret <= 0); - - while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) { - ssize_t this = min((ssize_t)iov->iov_len, ret); - iov->iov_base += this; - iov->iov_len -= this; - iocb->ki_left -= this; - ret -= this; - if (iov->iov_len == 0) { - iocb->ki_cur_seg++; - iov++; - } - } - - /* the caller should not have done more io than what fit in - * the remaining iovecs */ - BUG_ON(ret > 0 && iocb->ki_left == 0); -} - typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, unsigned long, loff_t); -static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret = 0; - - /* This matches the pread()/pwrite() logic */ - if (iocb->ki_pos < 0) - return -EINVAL; - - if (rw == WRITE) - file_start_write(file); - do { - ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], - iocb->ki_nr_segs - iocb->ki_cur_seg, - iocb->ki_pos); - if (ret > 0) - aio_advance_iovec(iocb, ret); - - /* retry all partial writes. retry partial reads as long as its a - * regular file. */ - } while (ret > 0 && iocb->ki_left > 0 && - (rw == WRITE || - (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); - if (rw == WRITE) - file_end_write(file); - - /* This means we must have transferred all that we could */ - /* No need to retry anymore */ - if ((ret == 0) || (iocb->ki_left == 0)) - ret = iocb->ki_nbytes - iocb->ki_left; - - /* If we managed to write some out we return that, rather than - * the eventual error. */ - if (rw == WRITE - && ret < 0 && ret != -EIOCBQUEUED - && iocb->ki_nbytes - iocb->ki_left) - ret = iocb->ki_nbytes - iocb->ki_left; - - return ret; -} - static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) { ssize_t ret; @@ -1204,9 +1138,22 @@ rw_common: return ret; req->ki_nbytes = ret; - req->ki_left = ret; - ret = aio_rw_vect_retry(req, rw, rw_op); + /* XXX: move/kill - rw_verify_area()? */ + /* This matches the pread()/pwrite() logic */ + if (req->ki_pos < 0) { + ret = -EINVAL; + break; + } + + if (rw == WRITE) + file_start_write(file); + + ret = rw_op(req, req->ki_iovec, + req->ki_nr_segs, req->ki_pos); + + if (rw == WRITE) + file_end_write(file); break; case IOCB_CMD_FDSYNC: @@ -1301,19 +1248,17 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_pos = iocb->aio_offset; req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf; - req->ki_left = req->ki_nbytes = iocb->aio_nbytes; + req->ki_nbytes = iocb->aio_nbytes; req->ki_opcode = iocb->aio_lio_opcode; ret = aio_run_iocb(req, compat); if (ret) goto out_put_req; - aio_put_req(req); /* drop extra ref to req */ return 0; out_put_req: put_reqs_available(ctx, 1); - aio_put_req(req); /* drop extra ref to req */ - aio_put_req(req); /* drop i/o ref to req */ + aio_put_req(req); return ret; } diff --git a/fs/block_dev.c b/fs/block_dev.c index c7bda5c..8772b15 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1542,7 +1542,7 @@ static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, return 0; size -= pos; - if (size < iocb->ki_left) + if (size < iocb->ki_nbytes) nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); return generic_file_aio_read(iocb, iov, nr_segs, pos); } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 0bd7a55..91ff089 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -130,7 +130,6 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_ return -EINVAL; #else - VM_BUG_ON(iocb->ki_left != PAGE_SIZE); VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); if (rw == READ || rw == KERNEL_READ) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 41000f2..dd1a490 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2245,7 +2245,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, file->f_path.dentry->d_name.name, (unsigned int)nr_segs); - if (iocb->ki_left == 0) + if (iocb->ki_nbytes == 0) return 0; appending = file->f_flags & O_APPEND ? 1 : 0; @@ -2296,7 +2296,7 @@ relock: can_do_direct = direct_io; ret = ocfs2_prepare_inode_for_write(file, ppos, - iocb->ki_left, appending, + iocb->ki_nbytes, appending, &can_do_direct, &has_refcount); if (ret < 0) { mlog_errno(ret); @@ -2304,7 +2304,7 @@ relock: } if (direct_io && !is_sync_kiocb(iocb)) - unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left, + unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes, *ppos); /* diff --git a/fs/read_write.c b/fs/read_write.c index 122a384..e3cd280 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -367,7 +367,6 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_left = len; kiocb.ki_nbytes = len; ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); @@ -417,7 +416,6 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_left = len; kiocb.ki_nbytes = len; ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); @@ -599,7 +597,6 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_left = len; kiocb.ki_nbytes = len; ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); diff --git a/fs/udf/file.c b/fs/udf/file.c index 29569dd..c02a27a 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -141,7 +141,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); int err, pos; - size_t count = iocb->ki_left; + size_t count = iocb->ki_nbytes; struct udf_inode_info *iinfo = UDF_I(inode); down_write(&iinfo->i_data_sem); -- cgit v1.1 From 8bc92afcf7f5c598001dd04e62d88f57f6e89e51 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Feb 2013 16:36:27 -0800 Subject: aio: Kill unneeded kiocb members The old aio retry infrastucture needed to save the various arguments to to aio operations. But with the retry infrastructure gone, we can trim struct kiocb quite a bit. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Cc: Theodore Ts'o Signed-off-by: Benjamin LaHaise --- fs/aio.c | 69 +++++++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index d00904d..09fe1f3 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -723,8 +723,6 @@ static void kiocb_free(struct kiocb *req) eventfd_ctx_put(req->ki_eventfd); if (req->ki_dtor) req->ki_dtor(req); - if (req->ki_iovec != &req->ki_inline_vec) - kfree(req->ki_iovec); kmem_cache_free(kiocb_cachep, req); } @@ -1054,24 +1052,26 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, unsigned long, loff_t); -static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) +static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, + int rw, char __user *buf, + unsigned long *nr_segs, + struct iovec **iovec, + bool compat) { ssize_t ret; - kiocb->ki_nr_segs = kiocb->ki_nbytes; + *nr_segs = kiocb->ki_nbytes; #ifdef CONFIG_COMPAT if (compat) ret = compat_rw_copy_check_uvector(rw, - (struct compat_iovec __user *)kiocb->ki_buf, - kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec); + (struct compat_iovec __user *)buf, + *nr_segs, 1, *iovec, iovec); else #endif ret = rw_copy_check_uvector(rw, - (struct iovec __user *)kiocb->ki_buf, - kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec); + (struct iovec __user *)buf, + *nr_segs, 1, *iovec, iovec); if (ret < 0) return ret; @@ -1080,15 +1080,17 @@ static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) return 0; } -static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) +static ssize_t aio_setup_single_vector(struct kiocb *kiocb, + int rw, char __user *buf, + unsigned long *nr_segs, + struct iovec *iovec) { - if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes))) + if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes))) return -EFAULT; - kiocb->ki_iovec = &kiocb->ki_inline_vec; - kiocb->ki_iovec->iov_base = kiocb->ki_buf; - kiocb->ki_iovec->iov_len = kiocb->ki_nbytes; - kiocb->ki_nr_segs = 1; + iovec->iov_base = buf; + iovec->iov_len = kiocb->ki_nbytes; + *nr_segs = 1; return 0; } @@ -1097,15 +1099,18 @@ static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) * Performs the initial checks and aio retry method * setup for the kiocb at the time of io submission. */ -static ssize_t aio_run_iocb(struct kiocb *req, bool compat) +static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, + char __user *buf, bool compat) { struct file *file = req->ki_filp; ssize_t ret; + unsigned long nr_segs; int rw; fmode_t mode; aio_rw_op *rw_op; + struct iovec inline_vec, *iovec = &inline_vec; - switch (req->ki_opcode) { + switch (opcode) { case IOCB_CMD_PREAD: case IOCB_CMD_PREADV: mode = FMODE_READ; @@ -1126,16 +1131,21 @@ rw_common: if (!rw_op) return -EINVAL; - ret = (req->ki_opcode == IOCB_CMD_PREADV || - req->ki_opcode == IOCB_CMD_PWRITEV) - ? aio_setup_vectored_rw(rw, req, compat) - : aio_setup_single_vector(rw, req); + ret = (opcode == IOCB_CMD_PREADV || + opcode == IOCB_CMD_PWRITEV) + ? aio_setup_vectored_rw(req, rw, buf, &nr_segs, + &iovec, compat) + : aio_setup_single_vector(req, rw, buf, &nr_segs, + iovec); if (ret) return ret; ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); - if (ret < 0) + if (ret < 0) { + if (iovec != &inline_vec) + kfree(iovec); return ret; + } req->ki_nbytes = ret; @@ -1149,8 +1159,7 @@ rw_common: if (rw == WRITE) file_start_write(file); - ret = rw_op(req, req->ki_iovec, - req->ki_nr_segs, req->ki_pos); + ret = rw_op(req, iovec, nr_segs, req->ki_pos); if (rw == WRITE) file_end_write(file); @@ -1175,6 +1184,9 @@ rw_common: return -EINVAL; } + if (iovec != &inline_vec) + kfree(iovec); + if (ret != -EIOCBQUEUED) { /* * There's no easy way to restart the syscall since other AIO's @@ -1246,12 +1258,11 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_obj.user = user_iocb; req->ki_user_data = iocb->aio_data; req->ki_pos = iocb->aio_offset; - - req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf; req->ki_nbytes = iocb->aio_nbytes; - req->ki_opcode = iocb->aio_lio_opcode; - ret = aio_run_iocb(req, compat); + ret = aio_run_iocb(req, iocb->aio_lio_opcode, + (char __user *)(unsigned long)iocb->aio_buf, + compat); if (ret) goto out_put_req; -- cgit v1.1 From 57282d8fd744072d6d6f18fa6ebe3cc1149015bf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 May 2013 13:42:52 -0700 Subject: aio: Kill ki_users The kiocb refcount is only needed for cancellation - to ensure a kiocb isn't freed while a ki_cancel callback is running. But if we restrict ki_cancel callbacks to not block (which they currently don't), we can simply drop the refcount. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Cc: Theodore Ts'o Signed-off-by: Benjamin LaHaise --- fs/aio.c | 47 ++++++++++++----------------------------------- 1 file changed, 12 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 09fe1f3..69a608a 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -361,7 +361,6 @@ EXPORT_SYMBOL(kiocb_set_cancel_fn); static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb) { kiocb_cancel_fn *old, *cancel; - int ret = -EINVAL; /* * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it @@ -371,21 +370,13 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb) cancel = ACCESS_ONCE(kiocb->ki_cancel); do { if (!cancel || cancel == KIOCB_CANCELLED) - return ret; + return -EINVAL; old = cancel; cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); } while (cancel != old); - atomic_inc(&kiocb->ki_users); - spin_unlock_irq(&ctx->ctx_lock); - - ret = cancel(kiocb); - - spin_lock_irq(&ctx->ctx_lock); - aio_put_req(kiocb); - - return ret; + return cancel(kiocb); } static void free_ioctx_rcu(struct rcu_head *head) @@ -599,16 +590,16 @@ static void kill_ioctx(struct kioctx *ctx) /* wait_on_sync_kiocb: * Waits on the given sync kiocb to complete. */ -ssize_t wait_on_sync_kiocb(struct kiocb *iocb) +ssize_t wait_on_sync_kiocb(struct kiocb *req) { - while (atomic_read(&iocb->ki_users)) { + while (!req->ki_ctx) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!atomic_read(&iocb->ki_users)) + if (req->ki_ctx) break; io_schedule(); } __set_current_state(TASK_RUNNING); - return iocb->ki_user_data; + return req->ki_user_data; } EXPORT_SYMBOL(wait_on_sync_kiocb); @@ -687,14 +678,8 @@ out: } /* aio_get_req - * Allocate a slot for an aio request. Increments the ki_users count - * of the kioctx so that the kioctx stays around until all requests are - * complete. Returns NULL if no requests are free. - * - * Returns with kiocb->ki_users set to 2. The io submit code path holds - * an extra reference while submitting the i/o. - * This prevents races between the aio code path referencing the - * req (after submitting it) and aio_complete() freeing the req. + * Allocate a slot for an aio request. + * Returns NULL if no requests are free. */ static inline struct kiocb *aio_get_req(struct kioctx *ctx) { @@ -707,7 +692,6 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) if (unlikely(!req)) goto out_put; - atomic_set(&req->ki_users, 1); req->ki_ctx = ctx; return req; out_put: @@ -726,13 +710,6 @@ static void kiocb_free(struct kiocb *req) kmem_cache_free(kiocb_cachep, req); } -void aio_put_req(struct kiocb *req) -{ - if (atomic_dec_and_test(&req->ki_users)) - kiocb_free(req); -} -EXPORT_SYMBOL(aio_put_req); - static struct kioctx *lookup_ioctx(unsigned long ctx_id) { struct mm_struct *mm = current->mm; @@ -771,9 +748,9 @@ void aio_complete(struct kiocb *iocb, long res, long res2) * - the sync task helpfully left a reference to itself in the iocb */ if (is_sync_kiocb(iocb)) { - BUG_ON(atomic_read(&iocb->ki_users) != 1); iocb->ki_user_data = res; - atomic_set(&iocb->ki_users, 0); + smp_wmb(); + iocb->ki_ctx = ERR_PTR(-EXDEV); wake_up_process(iocb->ki_obj.tsk); return; } @@ -845,7 +822,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) eventfd_signal(iocb->ki_eventfd, 1); /* everything turned out well, dispose of the aiocb. */ - aio_put_req(iocb); + kiocb_free(iocb); /* * We have to order our ring_info tail store above and test @@ -1269,7 +1246,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, return 0; out_put_req: put_reqs_available(ctx, 1); - aio_put_req(req); + kiocb_free(req); return ret; } -- cgit v1.1 From d29c445b635b3a03cf683cafcbae58a4ec1e1125 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 18 Mar 2013 11:09:26 -0700 Subject: aio: Kill ki_dtor sock_aio_dtor() is dead code - and stuff that does need to do cleanup can simply do it before calling aio_complete(). Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Cc: Theodore Ts'o Signed-off-by: Benjamin LaHaise --- fs/aio.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 69a608a..e46b1195 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -705,8 +705,6 @@ static void kiocb_free(struct kiocb *req) fput(req->ki_filp); if (req->ki_eventfd != NULL) eventfd_ctx_put(req->ki_eventfd); - if (req->ki_dtor) - req->ki_dtor(req); kmem_cache_free(kiocb_cachep, req); } -- cgit v1.1 From 4cd81c3dfc4a34e4a0b6fa577860077c8e5b13af Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Tue, 30 Jul 2013 12:06:37 -0400 Subject: aio: double aio_max_nr in calculations With the changes to use percpu counters for aio event ring size calculation, existing increases to aio_max_nr are now insufficient to allow for the allocation of enough events. Double the value used for aio_max_nr to account for the doubling introduced by the percpu slack. Signed-off-by: Benjamin LaHaise --- fs/aio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index e46b1195..945dd0d 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -490,7 +490,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) return ERR_PTR(-EINVAL); } - if (!nr_events || (unsigned long)nr_events > aio_max_nr) + if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL)) return ERR_PTR(-EAGAIN); ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); @@ -522,7 +522,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); - if (aio_nr + nr_events > aio_max_nr || + if (aio_nr + nr_events > (aio_max_nr * 2UL) || aio_nr + nr_events < aio_nr) { spin_unlock(&aio_nr_lock); goto out_cleanup; -- cgit v1.1 From db446a08c23d5475e6b08c87acca79ebb20f283c Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Tue, 30 Jul 2013 12:54:40 -0400 Subject: aio: convert the ioctx list to table lookup v3 On Wed, Jun 12, 2013 at 11:14:40AM -0700, Kent Overstreet wrote: > On Mon, Apr 15, 2013 at 02:40:55PM +0300, Octavian Purdila wrote: > > When using a large number of threads performing AIO operations the > > IOCTX list may get a significant number of entries which will cause > > significant overhead. For example, when running this fio script: > > > > rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1 > > blocksize=1024; numjobs=512; thread; loops=100 > > > > on an EXT2 filesystem mounted on top of a ramdisk we can observe up to > > 30% CPU time spent by lookup_ioctx: > > > > 32.51% [guest.kernel] [g] lookup_ioctx > > 9.19% [guest.kernel] [g] __lock_acquire.isra.28 > > 4.40% [guest.kernel] [g] lock_release > > 4.19% [guest.kernel] [g] sched_clock_local > > 3.86% [guest.kernel] [g] local_clock > > 3.68% [guest.kernel] [g] native_sched_clock > > 3.08% [guest.kernel] [g] sched_clock_cpu > > 2.64% [guest.kernel] [g] lock_release_holdtime.part.11 > > 2.60% [guest.kernel] [g] memcpy > > 2.33% [guest.kernel] [g] lock_acquired > > 2.25% [guest.kernel] [g] lock_acquire > > 1.84% [guest.kernel] [g] do_io_submit > > > > This patchs converts the ioctx list to a radix tree. For a performance > > comparison the above FIO script was run on a 2 sockets 8 core > > machine. This are the results (average and %rsd of 10 runs) for the > > original list based implementation and for the radix tree based > > implementation: > > > > cores 1 2 4 8 16 32 > > list 109376 ms 69119 ms 35682 ms 22671 ms 19724 ms 16408 ms > > %rsd 0.69% 1.15% 1.17% 1.21% 1.71% 1.43% > > radix 73651 ms 41748 ms 23028 ms 16766 ms 15232 ms 13787 ms > > %rsd 1.19% 0.98% 0.69% 1.13% 0.72% 0.75% > > % of radix > > relative 66.12% 65.59% 66.63% 72.31% 77.26% 83.66% > > to list > > > > To consider the impact of the patch on the typical case of having > > only one ctx per process the following FIO script was run: > > > > rw=randrw; size=100m ;directory=/mnt/fio; ioengine=libaio; iodepth=1 > > blocksize=1024; numjobs=1; thread; loops=100 > > > > on the same system and the results are the following: > > > > list 58892 ms > > %rsd 0.91% > > radix 59404 ms > > %rsd 0.81% > > % of radix > > relative 100.87% > > to list > > So, I was just doing some benchmarking/profiling to get ready to send > out the aio patches I've got for 3.11 - and it looks like your patch is > causing a ~1.5% throughput regression in my testing :/ ... I've got an alternate approach for fixing this wart in lookup_ioctx()... Instead of using an rbtree, just use the reserved id in the ring buffer header to index an array pointing the ioctx. It's not finished yet, and it needs to be tidied up, but is most of the way there. -ben -- "Thought is the essence of where you are now." -- kmo> And, a rework of Ben's code, but this was entirely his idea kmo> -Kent bcrl> And fix the code to use the right mm_struct in kill_ioctx(), actually free memory. Signed-off-by: Benjamin LaHaise --- fs/aio.c | 136 ++++++++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 114 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 945dd0d..52f200e 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -66,6 +66,12 @@ struct aio_ring { #define AIO_RING_PAGES 8 +struct kioctx_table { + struct rcu_head rcu; + unsigned nr; + struct kioctx *table[]; +}; + struct kioctx_cpu { unsigned reqs_available; }; @@ -74,9 +80,7 @@ struct kioctx { struct percpu_ref users; atomic_t dead; - /* This needs improving */ unsigned long user_id; - struct hlist_node list; struct __percpu kioctx_cpu *cpu; @@ -135,6 +139,8 @@ struct kioctx { struct page *internal_pages[AIO_RING_PAGES]; struct file *aio_ring_file; + + unsigned id; }; /*------ sysctl variables----*/ @@ -326,7 +332,7 @@ static int aio_setup_ring(struct kioctx *ctx) ring = kmap_atomic(ctx->ring_pages[0]); ring->nr = nr_events; /* user copy */ - ring->id = ctx->user_id; + ring->id = ~0U; ring->head = ring->tail = 0; ring->magic = AIO_RING_MAGIC; ring->compat_features = AIO_RING_COMPAT_FEATURES; @@ -462,6 +468,58 @@ static void free_ioctx_ref(struct percpu_ref *ref) schedule_work(&ctx->free_work); } +static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) +{ + unsigned i, new_nr; + struct kioctx_table *table, *old; + struct aio_ring *ring; + + spin_lock(&mm->ioctx_lock); + table = rcu_dereference(mm->ioctx_table); + + while (1) { + if (table) + for (i = 0; i < table->nr; i++) + if (!table->table[i]) { + ctx->id = i; + table->table[i] = ctx; + spin_unlock(&mm->ioctx_lock); + + ring = kmap_atomic(ctx->ring_pages[0]); + ring->id = ctx->id; + kunmap_atomic(ring); + return 0; + } + + new_nr = (table ? table->nr : 1) * 4; + + spin_unlock(&mm->ioctx_lock); + + table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * + new_nr, GFP_KERNEL); + if (!table) + return -ENOMEM; + + table->nr = new_nr; + + spin_lock(&mm->ioctx_lock); + old = rcu_dereference(mm->ioctx_table); + + if (!old) { + rcu_assign_pointer(mm->ioctx_table, table); + } else if (table->nr > old->nr) { + memcpy(table->table, old->table, + old->nr * sizeof(struct kioctx *)); + + rcu_assign_pointer(mm->ioctx_table, table); + kfree_rcu(old, rcu); + } else { + kfree(table); + table = old; + } + } +} + /* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. */ @@ -520,6 +578,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); BUG_ON(!ctx->req_batch); + err = ioctx_add_table(ctx, mm); + if (err) + goto out_cleanup_noerr; + /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); if (aio_nr + nr_events > (aio_max_nr * 2UL) || @@ -532,17 +594,13 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ - /* now link into global list. */ - spin_lock(&mm->ioctx_lock); - hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); - spin_unlock(&mm->ioctx_lock); - pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, mm, ctx->nr_events); return ctx; out_cleanup: err = -EAGAIN; +out_cleanup_noerr: aio_free_ring(ctx); out_freepcpu: free_percpu(ctx->cpu); @@ -561,10 +619,18 @@ out_freectx: * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx. */ -static void kill_ioctx(struct kioctx *ctx) +static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) { if (!atomic_xchg(&ctx->dead, 1)) { - hlist_del_rcu(&ctx->list); + struct kioctx_table *table; + + spin_lock(&mm->ioctx_lock); + table = rcu_dereference(mm->ioctx_table); + + WARN_ON(ctx != table->table[ctx->id]); + table->table[ctx->id] = NULL; + spin_unlock(&mm->ioctx_lock); + /* percpu_ref_kill() will do the necessary call_rcu() */ wake_up_all(&ctx->wait); @@ -613,10 +679,28 @@ EXPORT_SYMBOL(wait_on_sync_kiocb); */ void exit_aio(struct mm_struct *mm) { + struct kioctx_table *table; struct kioctx *ctx; - struct hlist_node *n; + unsigned i = 0; + + while (1) { + rcu_read_lock(); + table = rcu_dereference(mm->ioctx_table); + + do { + if (!table || i >= table->nr) { + rcu_read_unlock(); + rcu_assign_pointer(mm->ioctx_table, NULL); + if (table) + kfree(table); + return; + } + + ctx = table->table[i++]; + } while (!ctx); + + rcu_read_unlock(); - hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) { /* * We don't need to bother with munmap() here - * exit_mmap(mm) is coming and it'll unmap everything. @@ -627,7 +711,7 @@ void exit_aio(struct mm_struct *mm) */ ctx->mmap_size = 0; - kill_ioctx(ctx); + kill_ioctx(mm, ctx); } } @@ -710,19 +794,27 @@ static void kiocb_free(struct kiocb *req) static struct kioctx *lookup_ioctx(unsigned long ctx_id) { + struct aio_ring __user *ring = (void __user *)ctx_id; struct mm_struct *mm = current->mm; struct kioctx *ctx, *ret = NULL; + struct kioctx_table *table; + unsigned id; + + if (get_user(id, &ring->id)) + return NULL; rcu_read_lock(); + table = rcu_dereference(mm->ioctx_table); - hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { - if (ctx->user_id == ctx_id) { - percpu_ref_get(&ctx->users); - ret = ctx; - break; - } - } + if (!table || id >= table->nr) + goto out; + ctx = table->table[id]; + if (ctx->user_id == ctx_id) { + percpu_ref_get(&ctx->users); + ret = ctx; + } +out: rcu_read_unlock(); return ret; } @@ -998,7 +1090,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) - kill_ioctx(ioctx); + kill_ioctx(current->mm, ioctx); percpu_ref_put(&ioctx->users); } @@ -1016,7 +1108,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { - kill_ioctx(ioctx); + kill_ioctx(current->mm, ioctx); percpu_ref_put(&ioctx->users); return 0; } -- cgit v1.1 From 6878ea72a5d1aa6caae86449975a50b7fe9abed5 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Wed, 31 Jul 2013 10:34:18 -0400 Subject: aio: be defensive to ensure request batching is non-zero instead of BUG_ON() In the event that an overflow/underflow occurs while calculating req_batch, clamp the minimum at 1 request instead of doing a BUG_ON(). Signed-off-by: Benjamin LaHaise --- fs/aio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 52f200e..588aff9 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -576,7 +576,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) atomic_set(&ctx->reqs_available, ctx->nr_events - 1); ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); - BUG_ON(!ctx->req_batch); + if (ctx->req_batch < 1) + ctx->req_batch = 1; err = ioctx_add_table(ctx, mm); if (err) -- cgit v1.1 From da90382c2ec367aac88ff6aa76afb659ee0e4235 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Mon, 5 Aug 2013 13:21:43 -0400 Subject: aio: fix error handling and rcu usage in "convert the ioctx list to table lookup v3" In the patch "aio: convert the ioctx list to table lookup v3", incorrect handling in the ioctx_alloc() error path was introduced that lead to an ioctx being added via ioctx_add_table() while freed when the ioctx_alloc() call returned -EAGAIN due to hitting the aio_max_nr limit. Fix this by only calling ioctx_add_table() as the last step in ioctx_alloc(). Also, several unnecessary rcu_dereference() calls were added that lead to RCU warnings where the system was already protected by a spin lock for accessing mm->ioctx_table. Signed-off-by: Benjamin LaHaise --- fs/aio.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 588aff9..3bc068c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -475,7 +475,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) struct aio_ring *ring; spin_lock(&mm->ioctx_lock); - table = rcu_dereference(mm->ioctx_table); + table = mm->ioctx_table; while (1) { if (table) @@ -503,7 +503,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) table->nr = new_nr; spin_lock(&mm->ioctx_lock); - old = rcu_dereference(mm->ioctx_table); + old = mm->ioctx_table; if (!old) { rcu_assign_pointer(mm->ioctx_table, table); @@ -579,10 +579,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) if (ctx->req_batch < 1) ctx->req_batch = 1; - err = ioctx_add_table(ctx, mm); - if (err) - goto out_cleanup_noerr; - /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); if (aio_nr + nr_events > (aio_max_nr * 2UL) || @@ -595,13 +591,18 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ + err = ioctx_add_table(ctx, mm); + if (err) + goto out_cleanup_put; + pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, mm, ctx->nr_events); return ctx; +out_cleanup_put: + percpu_ref_put(&ctx->users); out_cleanup: err = -EAGAIN; -out_cleanup_noerr: aio_free_ring(ctx); out_freepcpu: free_percpu(ctx->cpu); @@ -626,7 +627,7 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) struct kioctx_table *table; spin_lock(&mm->ioctx_lock); - table = rcu_dereference(mm->ioctx_table); + table = mm->ioctx_table; WARN_ON(ctx != table->table[ctx->id]); table->table[ctx->id] = NULL; -- cgit v1.1 From f30d704fe1244c44a984d3d1f47bc648bcc6c9f7 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Wed, 7 Aug 2013 18:23:48 -0400 Subject: aio: table lookup: verify ctx pointer Another shortcoming of the table lookup patch was revealed where the pointer was not being tested before being dereferenced. Verify this to avoid the NULL pointer dereference. Signed-off-by: Benjamin LaHaise --- fs/aio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 3bc068c..c3f005d 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -812,7 +812,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) goto out; ctx = table->table[id]; - if (ctx->user_id == ctx_id) { + if (ctx && ctx->user_id == ctx_id) { percpu_ref_get(&ctx->users); ret = ctx; } -- cgit v1.1 From 79bd1bcf1ab22ea723da7d5854a9e72a350ecbf8 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Fri, 30 Aug 2013 10:22:04 -0400 Subject: aio: remove unnecessary debugging from aio_free_ring() The commit 36bc08cc0170 ("fs/aio: Add support to aio ring pages migration") added some debugging code that is not required and resulted in a build error when 98474236f72e ("vfs: make the dentry cache use the lockref infrastructure") was added to the tree. The code is not required, so just delete it. Signed-off-by: Benjamin LaHaise --- fs/aio.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index c3f005d..d0defcb 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -183,11 +183,6 @@ static void aio_free_ring(struct kioctx *ctx) if (aio_ring_file) { truncate_setsize(aio_ring_file->f_inode, 0); - pr_debug("pid(%d) i_nlink=%u d_count=%d d_unhashed=%d i_count=%d\n", - current->pid, aio_ring_file->f_inode->i_nlink, - aio_ring_file->f_path.dentry->d_count, - d_unhashed(aio_ring_file->f_path.dentry), - atomic_read(&aio_ring_file->f_inode->i_count)); fput(aio_ring_file); ctx->aio_ring_file = NULL; } -- cgit v1.1 From 77d30b14d24e557f89c41980011d72428514d729 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Fri, 30 Aug 2013 11:12:50 -0400 Subject: aio: fix rcu sparse warnings introduced by ioctx table lookup patch Sseveral sparse warnings were caused by missing rcu_dereference() annotations for dereferencing mm->ioctx_table. Thankfully, none of those were actual bugs as the deref was protected by a spin lock in all instances. Signed-off-by: Benjamin LaHaise Reported-by: Fengguang Wu --- fs/aio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index d0defcb..6e26755 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -470,7 +470,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) struct aio_ring *ring; spin_lock(&mm->ioctx_lock); - table = mm->ioctx_table; + table = rcu_dereference(mm->ioctx_table); while (1) { if (table) @@ -498,7 +498,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) table->nr = new_nr; spin_lock(&mm->ioctx_lock); - old = mm->ioctx_table; + old = rcu_dereference(mm->ioctx_table); if (!old) { rcu_assign_pointer(mm->ioctx_table, table); @@ -622,7 +622,7 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) struct kioctx_table *table; spin_lock(&mm->ioctx_lock); - table = mm->ioctx_table; + table = rcu_dereference(mm->ioctx_table); WARN_ON(ctx != table->table[ctx->id]); table->table[ctx->id] = NULL; -- cgit v1.1 From d6c355c7dabcd753a75bc77d150d36328a355267 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Mon, 9 Sep 2013 11:57:59 -0400 Subject: aio: fix race in ring buffer page lookup introduced by page migration support Prior to the introduction of page migration support in "fs/aio: Add support to aio ring pages migration" / 36bc08cc01709b4a9bb563b35aa530241ddc63e3, mapping of the ring buffer pages was done via get_user_pages() while retaining mmap_sem held for write. This avoided possible races with userland racing an munmap() or mremap(). The page migration patch, however, switched to using mm_populate() to prime the page mapping. mm_populate() cannot be called with mmap_sem held. Instead of dropping the mmap_sem, revert to the old behaviour and simply drop the use of mm_populate() since get_user_pages() will cause the pages to get mapped anyways. Thanks to Al Viro for spotting this issue. Signed-off-by: Benjamin LaHaise --- fs/aio.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 6e26755..f4a27af 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -307,16 +307,25 @@ static int aio_setup_ring(struct kioctx *ctx) aio_free_ring(ctx); return -EAGAIN; } - up_write(&mm->mmap_sem); - - mm_populate(ctx->mmap_base, populate); pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); + + /* We must do this while still holding mmap_sem for write, as we + * need to be protected against userspace attempting to mremap() + * or munmap() the ring buffer. + */ ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, 1, 0, ctx->ring_pages, NULL); + + /* Dropping the reference here is safe as the page cache will hold + * onto the pages for us. It is also required so that page migration + * can unmap the pages and get the right reference count. + */ for (i = 0; i < ctx->nr_pages; i++) put_page(ctx->ring_pages[i]); + up_write(&mm->mmap_sem); + if (unlikely(ctx->nr_pages != nr_pages)) { aio_free_ring(ctx); return -EAGAIN; -- cgit v1.1 From d9b2c8714aef102dea95544a8cd9372b21af463f Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Mon, 9 Sep 2013 12:29:35 -0400 Subject: aio: rcu_read_lock protection for new rcu_dereference calls Patch "aio: fix rcu sparse warnings introduced by ioctx table lookup patch" (77d30b14d24e557f89c41980011d72428514d729 in linux-next.git) introduced a couple of new rcu_dereference calls which are not protected by rcu_read_lock and result in following warnings during syscall fuzzing(trinity): [ 471.646379] =============================== [ 471.649727] [ INFO: suspicious RCU usage. ] [ 471.653919] 3.11.0-next-20130906+ #496 Not tainted [ 471.657792] ------------------------------- [ 471.661235] fs/aio.c:503 suspicious rcu_dereference_check() usage! [ 471.665968] [ 471.665968] other info that might help us debug this: [ 471.665968] [ 471.672141] [ 471.672141] rcu_scheduler_active = 1, debug_locks = 1 [ 471.677549] 1 lock held by trinity-child0/3774: [ 471.681675] #0: (&(&mm->ioctx_lock)->rlock){+.+...}, at: [] SyS_io_setup+0x63a/0xc70 [ 471.688721] [ 471.688721] stack backtrace: [ 471.692488] CPU: 1 PID: 3774 Comm: trinity-child0 Not tainted 3.11.0-next-20130906+ #496 [ 471.698437] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 471.703151] 00000000 00000000 c58bbf30 c18a814b de2234c0 c58bbf58 c10a4ec6 c1b0d824 [ 471.709544] c1b0f60e 00000001 00000001 c1af61b0 00000000 cb670ac0 c3aca000 c58bbfac [ 471.716251] c119bc7c 00000002 00000001 00000000 c119b8dd 00000000 c10cf684 c58bbfb4 [ 471.722902] Call Trace: [ 471.724859] [] dump_stack+0x4b/0x66 [ 471.728772] [] lockdep_rcu_suspicious+0xc6/0x100 [ 471.733716] [] SyS_io_setup+0x89c/0xc70 [ 471.737806] [] ? SyS_io_setup+0x4fd/0xc70 [ 471.741689] [] ? __audit_syscall_entry+0x94/0xe0 [ 471.746080] [] syscall_call+0x7/0xb [ 471.749723] [] ? task_fork_fair+0x240/0x260 Signed-off-by: Artem Savkov Reviewed-by: Gu Zheng Signed-off-by: Benjamin LaHaise --- fs/aio.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index f4a27af..6b868f0 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -479,6 +479,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) struct aio_ring *ring; spin_lock(&mm->ioctx_lock); + rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); while (1) { @@ -487,6 +488,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) if (!table->table[i]) { ctx->id = i; table->table[i] = ctx; + rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); ring = kmap_atomic(ctx->ring_pages[0]); @@ -497,6 +499,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) new_nr = (table ? table->nr : 1) * 4; + rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * @@ -507,6 +510,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) table->nr = new_nr; spin_lock(&mm->ioctx_lock); + rcu_read_lock(); old = rcu_dereference(mm->ioctx_table); if (!old) { @@ -631,10 +635,12 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) struct kioctx_table *table; spin_lock(&mm->ioctx_lock); + rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); WARN_ON(ctx != table->table[ctx->id]); table->table[ctx->id] = NULL; + rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); /* percpu_ref_kill() will do the necessary call_rcu() */ -- cgit v1.1