summaryrefslogtreecommitdiffstats
path: root/fs/direct-io.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2011-07-22 19:02:39 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2011-07-22 19:02:39 -0700
commitbbd9d6f7fbb0305c9a592bf05a32e87eb364a4ff (patch)
tree12b2bb4202b05f6ae6a43c6ce830a0472043dbe5 /fs/direct-io.c
parent8e204874db000928e37199c2db82b7eb8966cc3c (diff)
parent5a9a43646cf709312d71eca71cef90ad802f28f9 (diff)
downloadop-kernel-dev-bbd9d6f7fbb0305c9a592bf05a32e87eb364a4ff.zip
op-kernel-dev-bbd9d6f7fbb0305c9a592bf05a32e87eb364a4ff.tar.gz
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6: (107 commits) vfs: use ERR_CAST for err-ptr tossing in lookup_instantiate_filp isofs: Remove global fs lock jffs2: fix IN_DELETE_SELF on overwriting rename() killing a directory fix IN_DELETE_SELF on overwriting rename() on ramfs et.al. mm/truncate.c: fix build for CONFIG_BLOCK not enabled fs:update the NOTE of the file_operations structure Remove dead code in dget_parent() AFS: Fix silly characters in a comment switch d_add_ci() to d_splice_alias() in "found negative" case as well simplify gfs2_lookup() jfs_lookup(): don't bother with . or .. get rid of useless dget_parent() in btrfs rename() and link() get rid of useless dget_parent() in fs/btrfs/ioctl.c fs: push i_mutex and filemap_write_and_wait down into ->fsync() handlers drivers: fix up various ->llseek() implementations fs: handle SEEK_HOLE/SEEK_DATA properly in all fs's that define their own llseek Ext4: handle SEEK_HOLE/SEEK_DATA generically Btrfs: implement our own ->llseek fs: add SEEK_HOLE and SEEK_DATA flags reiserfs: make reiserfs default to barrier=flush ... Fix up trivial conflicts in fs/xfs/linux-2.6/xfs_super.c due to the new shrinker callout for the inode cache, that clashed with the xfs code to start the periodic workers later.
Diffstat (limited to 'fs/direct-io.c')
-rw-r--r--fs/direct-io.c88
1 files changed, 65 insertions, 23 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c
index ac5f164..01d2d9e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -135,6 +135,50 @@ struct dio {
struct page *pages[DIO_PAGES]; /* page buffer */
};
+static void __inode_dio_wait(struct inode *inode)
+{
+ wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
+ DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
+
+ do {
+ prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE);
+ if (atomic_read(&inode->i_dio_count))
+ schedule();
+ } while (atomic_read(&inode->i_dio_count));
+ finish_wait(wq, &q.wait);
+}
+
+/**
+ * inode_dio_wait - wait for outstanding DIO requests to finish
+ * @inode: inode to wait for
+ *
+ * Waits for all pending direct I/O requests to finish so that we can
+ * proceed with a truncate or equivalent operation.
+ *
+ * Must be called under a lock that serializes taking new references
+ * to i_dio_count, usually by inode->i_mutex.
+ */
+void inode_dio_wait(struct inode *inode)
+{
+ if (atomic_read(&inode->i_dio_count))
+ __inode_dio_wait(inode);
+}
+EXPORT_SYMBOL_GPL(inode_dio_wait);
+
+/*
+ * inode_dio_done - signal finish of a direct I/O requests
+ * @inode: inode the direct I/O happens on
+ *
+ * This is called once we've finished processing a direct I/O request,
+ * and is used to wake up callers waiting for direct I/O to be quiesced.
+ */
+void inode_dio_done(struct inode *inode)
+{
+ if (atomic_dec_and_test(&inode->i_dio_count))
+ wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
+}
+EXPORT_SYMBOL_GPL(inode_dio_done);
+
/*
* How many pages are in the queue?
*/
@@ -249,14 +293,12 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
if (dio->end_io && dio->result) {
dio->end_io(dio->iocb, offset, transferred,
dio->map_bh.b_private, ret, is_async);
- } else if (is_async) {
- aio_complete(dio->iocb, ret, 0);
+ } else {
+ if (is_async)
+ aio_complete(dio->iocb, ret, 0);
+ inode_dio_done(dio->inode);
}
- if (dio->flags & DIO_LOCKING)
- /* lockdep: non-owner release */
- up_read_non_owner(&dio->inode->i_alloc_sem);
-
return ret;
}
@@ -980,9 +1022,6 @@ out:
return ret;
}
-/*
- * Releases both i_mutex and i_alloc_sem
- */
static ssize_t
direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
const struct iovec *iov, loff_t offset, unsigned long nr_segs,
@@ -1146,15 +1185,16 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
* For writes this function is called under i_mutex and returns with
* i_mutex held, for reads, i_mutex is not held on entry, but it is
* taken and dropped again before returning.
- * For reads and writes i_alloc_sem is taken in shared mode and released
- * on I/O completion (which may happen asynchronously after returning to
- * the caller).
- *
* - if the flags value does NOT contain DIO_LOCKING we don't use any
* internal locking but rather rely on the filesystem to synchronize
* direct I/O reads/writes versus each other and truncate.
- * For reads and writes both i_mutex and i_alloc_sem are not held on
- * entry and are never taken.
+ *
+ * To help with locking against truncate we incremented the i_dio_count
+ * counter before starting direct I/O, and decrement it once we are done.
+ * Truncate can wait for it to reach zero to provide exclusion. It is
+ * expected that filesystem provide exclusion between new direct I/O
+ * and truncates. For DIO_LOCKING filesystems this is done by i_mutex,
+ * but other filesystems need to take care of this on their own.
*/
ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
@@ -1200,6 +1240,10 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
}
}
+ /* watch out for a 0 len io from a tricksy fs */
+ if (rw == READ && end == offset)
+ return 0;
+
dio = kmalloc(sizeof(*dio), GFP_KERNEL);
retval = -ENOMEM;
if (!dio)
@@ -1213,8 +1257,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
dio->flags = flags;
if (dio->flags & DIO_LOCKING) {
- /* watch out for a 0 len io from a tricksy fs */
- if (rw == READ && end > offset) {
+ if (rw == READ) {
struct address_space *mapping =
iocb->ki_filp->f_mapping;
@@ -1229,15 +1272,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
goto out;
}
}
-
- /*
- * Will be released at I/O completion, possibly in a
- * different thread.
- */
- down_read_non_owner(&inode->i_alloc_sem);
}
/*
+ * Will be decremented at I/O completion time.
+ */
+ atomic_inc(&inode->i_dio_count);
+
+ /*
* For file extending writes updating i_size before data
* writeouts complete can expose uninitialized blocks. So
* even for AIO, we need to wait for i/o to complete before
OpenPOWER on IntegriCloud