summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/vfs.txt44
-rw-r--r--MAINTAINERS6
-rw-r--r--drivers/dax/device.c1
-rw-r--r--fs/block_dev.c3
-rw-r--r--fs/btrfs/file.c13
-rw-r--r--fs/buffer.c20
-rw-r--r--fs/dax.c4
-rw-r--r--fs/ext2/file.c5
-rw-r--r--fs/ext4/fsync.c2
-rw-r--r--fs/file_table.c1
-rw-r--r--fs/gfs2/lops.c2
-rw-r--r--fs/jbd2/commit.c16
-rw-r--r--fs/libfs.c6
-rw-r--r--fs/open.c3
-rw-r--r--fs/xfs/xfs_file.c2
-rw-r--r--include/linux/buffer_head.h1
-rw-r--r--include/linux/errseq.h19
-rw-r--r--include/linux/fs.h61
-rw-r--r--include/linux/pagemap.h31
-rw-r--r--include/trace/events/filemap.h57
-rw-r--r--lib/Makefile2
-rw-r--r--lib/errseq.c208
-rw-r--r--mm/filemap.c126
-rw-r--r--mm/memory-failure.c2
24 files changed, 572 insertions, 63 deletions
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index f42b906..48c9faa 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -576,7 +576,43 @@ should clear PG_Dirty and set PG_Writeback. It can be actually
written at any point after PG_Dirty is clear. Once it is known to be
safe, PG_Writeback is cleared.
-Writeback makes use of a writeback_control structure...
+Writeback makes use of a writeback_control structure to direct the
+operations. This gives the the writepage and writepages operations some
+information about the nature of and reason for the writeback request,
+and the constraints under which it is being done. It is also used to
+return information back to the caller about the result of a writepage or
+writepages request.
+
+Handling errors during writeback
+--------------------------------
+Most applications that do buffered I/O will periodically call a file
+synchronization call (fsync, fdatasync, msync or sync_file_range) to
+ensure that data written has made it to the backing store. When there
+is an error during writeback, they expect that error to be reported when
+a file sync request is made. After an error has been reported on one
+request, subsequent requests on the same file descriptor should return
+0, unless further writeback errors have occurred since the previous file
+syncronization.
+
+Ideally, the kernel would report errors only on file descriptions on
+which writes were done that subsequently failed to be written back. The
+generic pagecache infrastructure does not track the file descriptions
+that have dirtied each individual page however, so determining which
+file descriptors should get back an error is not possible.
+
+Instead, the generic writeback error tracking infrastructure in the
+kernel settles for reporting errors to fsync on all file descriptions
+that were open at the time that the error occurred. In a situation with
+multiple writers, all of them will get back an error on a subsequent fsync,
+even if all of the writes done through that particular file descriptor
+succeeded (or even if there were no writes on that file descriptor at all).
+
+Filesystems that wish to use this infrastructure should call
+mapping_set_error to record the error in the address_space when it
+occurs. Then, after writing back data from the pagecache in their
+file->fsync operation, they should call file_check_and_advance_wb_err to
+ensure that the struct file's error cursor has advanced to the correct
+point in the stream of errors emitted by the backing device(s).
struct address_space_operations
-------------------------------
@@ -804,7 +840,8 @@ struct address_space_operations {
The File Object
===============
-A file object represents a file opened by a process.
+A file object represents a file opened by a process. This is also known
+as an "open file description" in POSIX parlance.
struct file_operations
@@ -887,7 +924,8 @@ otherwise noted.
release: called when the last reference to an open file is closed
- fsync: called by the fsync(2) system call
+ fsync: called by the fsync(2) system call. Also see the section above
+ entitled "Handling errors during writeback".
fasync: called by the fcntl(2) system call when asynchronous
(non-blocking) mode is enabled for a file
diff --git a/MAINTAINERS b/MAINTAINERS
index b31be75..71d4380 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5069,6 +5069,12 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/kristoffer/linux-hpc.git
F: drivers/video/fbdev/s1d13xxxfb.c
F: include/video/s1d13xxxfb.h
+ERRSEQ ERROR TRACKING INFRASTRUCTURE
+M: Jeff Layton <jlayton@poochiereds.net>
+S: Maintained
+F: lib/errseq.c
+F: include/linux/errseq.h
+
ET131X NETWORK DRIVER
M: Mark Einon <mark.einon@gmail.com>
S: Odd Fixes
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 006e657..12943d1 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -499,6 +499,7 @@ static int dax_open(struct inode *inode, struct file *filp)
inode->i_mapping = __dax_inode->i_mapping;
inode->i_mapping->host = __dax_inode;
filp->f_mapping = inode->i_mapping;
+ filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
filp->private_data = dev_dax;
inode->i_flags = S_DAX;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index a7df151..9941dc8 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -632,7 +632,7 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
struct block_device *bdev = I_BDEV(bd_inode);
int error;
- error = filemap_write_and_wait_range(filp->f_mapping, start, end);
+ error = file_write_and_wait_range(filp, start, end);
if (error)
return error;
@@ -1751,6 +1751,7 @@ static int blkdev_open(struct inode * inode, struct file * filp)
return -ENOMEM;
filp->f_mapping = bdev->bd_inode->i_mapping;
+ filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
return blkdev_get(bdev, filp->f_mode, filp);
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2433870..a85d790 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2032,7 +2032,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
- int ret = 0;
+ int ret = 0, err;
bool full_sync = 0;
u64 len;
@@ -2051,7 +2051,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret)
- return ret;
+ goto out;
inode_lock(inode);
atomic_inc(&root->log_batch);
@@ -2156,10 +2156,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* An ordered extent might have started before and completed
* already with io errors, in which case the inode was not
* updated and we end up here. So check the inode's mapping
- * flags for any errors that might have happened while doing
- * writeback of file data.
+ * for any errors that might have happened since we last
+ * checked called fsync.
*/
- ret = filemap_check_errors(inode->i_mapping);
+ ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
inode_unlock(inode);
goto out;
}
@@ -2248,6 +2248,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
ret = btrfs_end_transaction(trans);
}
out:
+ err = file_check_and_advance_wb_err(file);
+ if (!ret)
+ ret = err;
return ret > 0 ? -EIO : ret;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 5c2cba8..5234b15 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -178,7 +178,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
set_buffer_uptodate(bh);
} else {
buffer_io_error(bh, ", lost sync page write");
- set_buffer_write_io_error(bh);
+ mark_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
}
unlock_buffer(bh);
@@ -352,8 +352,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
set_buffer_uptodate(bh);
} else {
buffer_io_error(bh, ", lost async page write");
- mapping_set_error(page->mapping, -EIO);
- set_buffer_write_io_error(bh);
+ mark_buffer_write_io_error(bh);
clear_buffer_uptodate(bh);
SetPageError(page);
}
@@ -481,8 +480,6 @@ static void __remove_assoc_queue(struct buffer_head *bh)
{
list_del_init(&bh->b_assoc_buffers);
WARN_ON(!bh->b_assoc_map);
- if (buffer_write_io_error(bh))
- set_bit(AS_EIO, &bh->b_assoc_map->flags);
bh->b_assoc_map = NULL;
}
@@ -1181,6 +1178,17 @@ void mark_buffer_dirty(struct buffer_head *bh)
}
EXPORT_SYMBOL(mark_buffer_dirty);
+void mark_buffer_write_io_error(struct buffer_head *bh)
+{
+ set_buffer_write_io_error(bh);
+ /* FIXME: do we need to set this in both places? */
+ if (bh->b_page && bh->b_page->mapping)
+ mapping_set_error(bh->b_page->mapping, -EIO);
+ if (bh->b_assoc_map)
+ mapping_set_error(bh->b_assoc_map, -EIO);
+}
+EXPORT_SYMBOL(mark_buffer_write_io_error);
+
/*
* Decrement a buffer_head's reference count. If all buffers against a page
* have zero reference count, are clean and unlocked, and if the page is clean
@@ -3282,8 +3290,6 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
bh = head;
do {
- if (buffer_write_io_error(bh) && page->mapping)
- mapping_set_error(page->mapping, -EIO);
if (buffer_busy(bh))
goto failed;
bh = bh->b_this_page;
diff --git a/fs/dax.c b/fs/dax.c
index b1cd18d..306c2b6 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -855,8 +855,10 @@ int dax_writeback_mapping_range(struct address_space *mapping,
ret = dax_writeback_one(bdev, dax_dev, mapping,
indices[i], pvec.pages[i]);
- if (ret < 0)
+ if (ret < 0) {
+ mapping_set_error(mapping, ret);
goto out;
+ }
}
start_index = indices[pvec.nr - 1] + 1;
}
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index b21891a..d34d32b 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -174,15 +174,12 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
int ret;
struct super_block *sb = file->f_mapping->host->i_sb;
- struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
ret = generic_file_fsync(file, start, end, datasync);
- if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) {
+ if (ret == -EIO)
/* We don't really know where the IO error happened... */
ext2_error(sb, __func__,
"detected IO error when writing metadata buffers");
- ret = -EIO;
- }
return ret;
}
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 9d54960..aae2c39 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -124,7 +124,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
}
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ ret = file_write_and_wait_range(file, start, end);
if (ret)
return ret;
/*
diff --git a/fs/file_table.c b/fs/file_table.c
index 954d510..72e861a 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -168,6 +168,7 @@ struct file *alloc_file(const struct path *path, fmode_t mode,
file->f_path = *path;
file->f_inode = path->dentry->d_inode;
file->f_mapping = path->dentry->d_inode->i_mapping;
+ file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
if ((mode & FMODE_READ) &&
likely(fop->read || fop->read_iter))
mode |= FMODE_CAN_READ;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index e5259cd..3010f9e 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -180,7 +180,7 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec,
bh = bh->b_this_page;
do {
if (error)
- set_buffer_write_io_error(bh);
+ mark_buffer_write_io_error(bh);
unlock_buffer(bh);
next = bh->b_this_page;
size -= bh->b_size;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index b6b194e..3c1c313 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -263,18 +263,10 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
continue;
jinode->i_flags |= JI_COMMIT_RUNNING;
spin_unlock(&journal->j_list_lock);
- err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
- if (err) {
- /*
- * Because AS_EIO is cleared by
- * filemap_fdatawait_range(), set it again so
- * that user process can get -EIO from fsync().
- */
- mapping_set_error(jinode->i_vfs_inode->i_mapping, -EIO);
-
- if (!ret)
- ret = err;
- }
+ err = filemap_fdatawait_keep_errors(
+ jinode->i_vfs_inode->i_mapping);
+ if (!ret)
+ ret = err;
spin_lock(&journal->j_list_lock);
jinode->i_flags &= ~JI_COMMIT_RUNNING;
smp_mb();
diff --git a/fs/libfs.c b/fs/libfs.c
index a043953..3aabe55 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -974,7 +974,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
int err;
int ret;
- err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ err = file_write_and_wait_range(file, start, end);
if (err)
return err;
@@ -991,6 +991,10 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
out:
inode_unlock(inode);
+ /* check and advance again to catch errors after syncing out buffers */
+ err = file_check_and_advance_wb_err(file);
+ if (ret == 0)
+ ret = err;
return ret;
}
EXPORT_SYMBOL(__generic_file_fsync);
diff --git a/fs/open.c b/fs/open.c
index 3fe0c4a..35bb784 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -707,6 +707,9 @@ static int do_dentry_open(struct file *f,
f->f_inode = inode;
f->f_mapping = inode->i_mapping;
+ /* Ensure that we skip any errors that predate opening of the file */
+ f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
+
if (unlikely(f->f_flags & O_PATH)) {
f->f_mode = FMODE_PATH;
f->f_op = &empty_fops;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 17f27a2..51dfae5 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -140,7 +140,7 @@ xfs_file_fsync(
trace_xfs_file_fsync(ip);
- error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ error = file_write_and_wait_range(file, start, end);
if (error)
return error;
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index bd029e52..e0abeba 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -149,6 +149,7 @@ void buffer_check_dirty_writeback(struct page *page,
*/
void mark_buffer_dirty(struct buffer_head *bh);
+void mark_buffer_write_io_error(struct buffer_head *bh);
void init_buffer(struct buffer_head *, bh_end_io_t *, void *);
void touch_buffer(struct buffer_head *bh);
void set_bh_page(struct buffer_head *bh,
diff --git a/include/linux/errseq.h b/include/linux/errseq.h
new file mode 100644
index 0000000..9e0d444
--- /dev/null
+++ b/include/linux/errseq.h
@@ -0,0 +1,19 @@
+#ifndef _LINUX_ERRSEQ_H
+#define _LINUX_ERRSEQ_H
+
+/* See lib/errseq.c for more info */
+
+typedef u32 errseq_t;
+
+errseq_t __errseq_set(errseq_t *eseq, int err);
+static inline void errseq_set(errseq_t *eseq, int err)
+{
+ /* Optimize for the common case of no error */
+ if (unlikely(err))
+ __errseq_set(eseq, err);
+}
+
+errseq_t errseq_sample(errseq_t *eseq);
+int errseq_check(errseq_t *eseq, errseq_t since);
+int errseq_check_and_advance(errseq_t *eseq, errseq_t *since);
+#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 978fb59..0cfa471 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -32,6 +32,7 @@
#include <linux/workqueue.h>
#include <linux/delayed_call.h>
#include <linux/uuid.h>
+#include <linux/errseq.h>
#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
@@ -401,6 +402,7 @@ struct address_space {
gfp_t gfp_mask; /* implicit gfp mask for allocations */
struct list_head private_list; /* ditto */
void *private_data; /* ditto */
+ errseq_t wb_err;
} __attribute__((aligned(sizeof(long))));
/*
* On most architectures that alignment is already the case; but
@@ -879,6 +881,7 @@ struct file {
struct list_head f_tfile_llink;
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping;
+ errseq_t f_wb_err;
} __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
struct file_handle {
@@ -2536,7 +2539,7 @@ extern int write_inode_now(struct inode *, int);
extern int filemap_fdatawrite(struct address_space *);
extern int filemap_flush(struct address_space *);
extern int filemap_fdatawait(struct address_space *);
-extern void filemap_fdatawait_keep_errors(struct address_space *);
+extern int filemap_fdatawait_keep_errors(struct address_space *mapping);
extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
loff_t lend);
extern bool filemap_range_has_page(struct address_space *, loff_t lstart,
@@ -2550,6 +2553,62 @@ extern int filemap_fdatawrite_range(struct address_space *mapping,
loff_t start, loff_t end);
extern int filemap_check_errors(struct address_space *mapping);
+extern void __filemap_set_wb_err(struct address_space *mapping, int err);
+extern int __must_check file_check_and_advance_wb_err(struct file *file);
+extern int __must_check file_write_and_wait_range(struct file *file,
+ loff_t start, loff_t end);
+
+/**
+ * filemap_set_wb_err - set a writeback error on an address_space
+ * @mapping: mapping in which to set writeback error
+ * @err: error to be set in mapping
+ *
+ * When writeback fails in some way, we must record that error so that
+ * userspace can be informed when fsync and the like are called. We endeavor
+ * to report errors on any file that was open at the time of the error. Some
+ * internal callers also need to know when writeback errors have occurred.
+ *
+ * When a writeback error occurs, most filesystems will want to call
+ * filemap_set_wb_err to record the error in the mapping so that it will be
+ * automatically reported whenever fsync is called on the file.
+ *
+ * FIXME: mention FS_* flag here?
+ */
+static inline void filemap_set_wb_err(struct address_space *mapping, int err)
+{
+ /* Fastpath for common case of no error */
+ if (unlikely(err))
+ __filemap_set_wb_err(mapping, err);
+}
+
+/**
+ * filemap_check_wb_error - has an error occurred since the mark was sampled?
+ * @mapping: mapping to check for writeback errors
+ * @since: previously-sampled errseq_t
+ *
+ * Grab the errseq_t value from the mapping, and see if it has changed "since"
+ * the given value was sampled.
+ *
+ * If it has then report the latest error set, otherwise return 0.
+ */
+static inline int filemap_check_wb_err(struct address_space *mapping,
+ errseq_t since)
+{
+ return errseq_check(&mapping->wb_err, since);
+}
+
+/**
+ * filemap_sample_wb_err - sample the current errseq_t to test for later errors
+ * @mapping: mapping to be sampled
+ *
+ * Writeback errors are always reported relative to a particular sample point
+ * in the past. This function provides those sample points.
+ */
+static inline errseq_t filemap_sample_wb_err(struct address_space *mapping)
+{
+ return errseq_sample(&mapping->wb_err);
+}
+
extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
int datasync);
extern int vfs_fsync(struct file *file, int datasync);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e7bbd9d..baa9344 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -28,14 +28,33 @@ enum mapping_flags {
AS_NO_WRITEBACK_TAGS = 5,
};
+/**
+ * mapping_set_error - record a writeback error in the address_space
+ * @mapping - the mapping in which an error should be set
+ * @error - the error to set in the mapping
+ *
+ * When writeback fails in some way, we must record that error so that
+ * userspace can be informed when fsync and the like are called. We endeavor
+ * to report errors on any file that was open at the time of the error. Some
+ * internal callers also need to know when writeback errors have occurred.
+ *
+ * When a writeback error occurs, most filesystems will want to call
+ * mapping_set_error to record the error in the mapping so that it can be
+ * reported when the application calls fsync(2).
+ */
static inline void mapping_set_error(struct address_space *mapping, int error)
{
- if (unlikely(error)) {
- if (error == -ENOSPC)
- set_bit(AS_ENOSPC, &mapping->flags);
- else
- set_bit(AS_EIO, &mapping->flags);
- }
+ if (likely(!error))
+ return;
+
+ /* Record in wb_err for checkers using errseq_t based tracking */
+ filemap_set_wb_err(mapping, error);
+
+ /* Record it in flags for now, for legacy callers */
+ if (error == -ENOSPC)
+ set_bit(AS_ENOSPC, &mapping->flags);
+ else
+ set_bit(AS_EIO, &mapping->flags);
}
static inline void mapping_set_unevictable(struct address_space *mapping)
diff --git a/include/trace/events/filemap.h b/include/trace/events/filemap.h
index 42febb6..ff91325 100644
--- a/include/trace/events/filemap.h
+++ b/include/trace/events/filemap.h
@@ -10,6 +10,7 @@
#include <linux/memcontrol.h>
#include <linux/device.h>
#include <linux/kdev_t.h>
+#include <linux/errseq.h>
DECLARE_EVENT_CLASS(mm_filemap_op_page_cache,
@@ -52,6 +53,62 @@ DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_add_to_page_cache,
TP_ARGS(page)
);
+TRACE_EVENT(filemap_set_wb_err,
+ TP_PROTO(struct address_space *mapping, errseq_t eseq),
+
+ TP_ARGS(mapping, eseq),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, i_ino)
+ __field(dev_t, s_dev)
+ __field(errseq_t, errseq)
+ ),
+
+ TP_fast_assign(
+ __entry->i_ino = mapping->host->i_ino;
+ __entry->errseq = eseq;
+ if (mapping->host->i_sb)
+ __entry->s_dev = mapping->host->i_sb->s_dev;
+ else
+ __entry->s_dev = mapping->host->i_rdev;
+ ),
+
+ TP_printk("dev=%d:%d ino=0x%lx errseq=0x%x",
+ MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
+ __entry->i_ino, __entry->errseq)
+);
+
+TRACE_EVENT(file_check_and_advance_wb_err,
+ TP_PROTO(struct file *file, errseq_t old),
+
+ TP_ARGS(file, old),
+
+ TP_STRUCT__entry(
+ __field(struct file *, file);
+ __field(unsigned long, i_ino)
+ __field(dev_t, s_dev)
+ __field(errseq_t, old)
+ __field(errseq_t, new)
+ ),
+
+ TP_fast_assign(
+ __entry->file = file;
+ __entry->i_ino = file->f_mapping->host->i_ino;
+ if (file->f_mapping->host->i_sb)
+ __entry->s_dev =
+ file->f_mapping->host->i_sb->s_dev;
+ else
+ __entry->s_dev =
+ file->f_mapping->host->i_rdev;
+ __entry->old = old;
+ __entry->new = file->f_wb_err;
+ ),
+
+ TP_printk("file=%p dev=%d:%d ino=0x%lx old=0x%x new=0x%x",
+ __entry->file, MAJOR(__entry->s_dev),
+ MINOR(__entry->s_dev), __entry->i_ino, __entry->old,
+ __entry->new)
+);
#endif /* _TRACE_FILEMAP_H */
/* This part must be outside protection */
diff --git a/lib/Makefile b/lib/Makefile
index 7fb6ab7..5a00832 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -38,7 +38,7 @@ obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \
gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
bsearch.o find_bit.o llist.o memweight.o kfifo.o \
percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \
- once.o refcount.o usercopy.o
+ once.o refcount.o usercopy.o errseq.o
obj-y += string_helpers.o
obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
obj-y += hexdump.o
diff --git a/lib/errseq.c b/lib/errseq.c
new file mode 100644
index 0000000..841fa24
--- /dev/null
+++ b/lib/errseq.c
@@ -0,0 +1,208 @@
+#include <linux/err.h>
+#include <linux/bug.h>
+#include <linux/atomic.h>
+#include <linux/errseq.h>
+
+/*
+ * An errseq_t is a way of recording errors in one place, and allowing any
+ * number of "subscribers" to tell whether it has changed since a previous
+ * point where it was sampled.
+ *
+ * It's implemented as an unsigned 32-bit value. The low order bits are
+ * designated to hold an error code (between 0 and -MAX_ERRNO). The upper bits
+ * are used as a counter. This is done with atomics instead of locking so that
+ * these functions can be called from any context.
+ *
+ * The general idea is for consumers to sample an errseq_t value. That value
+ * can later be used to tell whether any new errors have occurred since that
+ * sampling was done.
+ *
+ * Note that there is a risk of collisions if new errors are being recorded
+ * frequently, since we have so few bits to use as a counter.
+ *
+ * To mitigate this, one bit is used as a flag to tell whether the value has
+ * been sampled since a new value was recorded. That allows us to avoid bumping
+ * the counter if no one has sampled it since the last time an error was
+ * recorded.
+ *
+ * A new errseq_t should always be zeroed out. A errseq_t value of all zeroes
+ * is the special (but common) case where there has never been an error. An all
+ * zero value thus serves as the "epoch" if one wishes to know whether there
+ * has ever been an error set since it was first initialized.
+ */
+
+/* The low bits are designated for error code (max of MAX_ERRNO) */
+#define ERRSEQ_SHIFT ilog2(MAX_ERRNO + 1)
+
+/* This bit is used as a flag to indicate whether the value has been seen */
+#define ERRSEQ_SEEN (1 << ERRSEQ_SHIFT)
+
+/* The lowest bit of the counter */
+#define ERRSEQ_CTR_INC (1 << (ERRSEQ_SHIFT + 1))
+
+/**
+ * __errseq_set - set a errseq_t for later reporting
+ * @eseq: errseq_t field that should be set
+ * @err: error to set
+ *
+ * This function sets the error in *eseq, and increments the sequence counter
+ * if the last sequence was sampled at some point in the past.
+ *
+ * Any error set will always overwrite an existing error.
+ *
+ * Most callers will want to use the errseq_set inline wrapper to efficiently
+ * handle the common case where err is 0.
+ *
+ * We do return an errseq_t here, primarily for debugging purposes. The return
+ * value should not be used as a previously sampled value in later calls as it
+ * will not have the SEEN flag set.
+ */
+errseq_t __errseq_set(errseq_t *eseq, int err)
+{
+ errseq_t cur, old;
+
+ /* MAX_ERRNO must be able to serve as a mask */
+ BUILD_BUG_ON_NOT_POWER_OF_2(MAX_ERRNO + 1);
+
+ /*
+ * Ensure the error code actually fits where we want it to go. If it
+ * doesn't then just throw a warning and don't record anything. We
+ * also don't accept zero here as that would effectively clear a
+ * previous error.
+ */
+ old = READ_ONCE(*eseq);
+
+ if (WARN(unlikely(err == 0 || (unsigned int)-err > MAX_ERRNO),
+ "err = %d\n", err))
+ return old;
+
+ for (;;) {
+ errseq_t new;
+
+ /* Clear out error bits and set new error */
+ new = (old & ~(MAX_ERRNO|ERRSEQ_SEEN)) | -err;
+
+ /* Only increment if someone has looked at it */
+ if (old & ERRSEQ_SEEN)
+ new += ERRSEQ_CTR_INC;
+
+ /* If there would be no change, then call it done */
+ if (new == old) {
+ cur = new;
+ break;
+ }
+
+ /* Try to swap the new value into place */
+ cur = cmpxchg(eseq, old, new);
+
+ /*
+ * Call it success if we did the swap or someone else beat us
+ * to it for the same value.
+ */
+ if (likely(cur == old || cur == new))
+ break;
+
+ /* Raced with an update, try again */
+ old = cur;
+ }
+ return cur;
+}
+EXPORT_SYMBOL(__errseq_set);
+
+/**
+ * errseq_sample - grab current errseq_t value
+ * @eseq: pointer to errseq_t to be sampled
+ *
+ * This function allows callers to sample an errseq_t value, marking it as
+ * "seen" if required.
+ */
+errseq_t errseq_sample(errseq_t *eseq)
+{
+ errseq_t old = READ_ONCE(*eseq);
+ errseq_t new = old;
+
+ /*
+ * For the common case of no errors ever having been set, we can skip
+ * marking the SEEN bit. Once an error has been set, the value will
+ * never go back to zero.
+ */
+ if (old != 0) {
+ new |= ERRSEQ_SEEN;
+ if (old != new)
+ cmpxchg(eseq, old, new);
+ }
+ return new;
+}
+EXPORT_SYMBOL(errseq_sample);
+
+/**
+ * errseq_check - has an error occurred since a particular sample point?
+ * @eseq: pointer to errseq_t value to be checked
+ * @since: previously-sampled errseq_t from which to check
+ *
+ * Grab the value that eseq points to, and see if it has changed "since"
+ * the given value was sampled. The "since" value is not advanced, so there
+ * is no need to mark the value as seen.
+ *
+ * Returns the latest error set in the errseq_t or 0 if it hasn't changed.
+ */
+int errseq_check(errseq_t *eseq, errseq_t since)
+{
+ errseq_t cur = READ_ONCE(*eseq);
+
+ if (likely(cur == since))
+ return 0;
+ return -(cur & MAX_ERRNO);
+}
+EXPORT_SYMBOL(errseq_check);
+
+/**
+ * errseq_check_and_advance - check an errseq_t and advance to current value
+ * @eseq: pointer to value being checked and reported
+ * @since: pointer to previously-sampled errseq_t to check against and advance
+ *
+ * Grab the eseq value, and see whether it matches the value that "since"
+ * points to. If it does, then just return 0.
+ *
+ * If it doesn't, then the value has changed. Set the "seen" flag, and try to
+ * swap it into place as the new eseq value. Then, set that value as the new
+ * "since" value, and return whatever the error portion is set to.
+ *
+ * Note that no locking is provided here for concurrent updates to the "since"
+ * value. The caller must provide that if necessary. Because of this, callers
+ * may want to do a lockless errseq_check before taking the lock and calling
+ * this.
+ */
+int errseq_check_and_advance(errseq_t *eseq, errseq_t *since)
+{
+ int err = 0;
+ errseq_t old, new;
+
+ /*
+ * Most callers will want to use the inline wrapper to check this,
+ * so that the common case of no error is handled without needing
+ * to take the lock that protects the "since" value.
+ */
+ old = READ_ONCE(*eseq);
+ if (old != *since) {
+ /*
+ * Set the flag and try to swap it into place if it has
+ * changed.
+ *
+ * We don't care about the outcome of the swap here. If the
+ * swap doesn't occur, then it has either been updated by a
+ * writer who is altering the value in some way (updating
+ * counter or resetting the error), or another reader who is
+ * just setting the "seen" flag. Either outcome is OK, and we
+ * can advance "since" and return an error based on what we
+ * have.
+ */
+ new = old | ERRSEQ_SEEN;
+ if (new != old)
+ cmpxchg(eseq, old, new);
+ *since = new;
+ err = -(new & MAX_ERRNO);
+ }
+ return err;
+}
+EXPORT_SYMBOL(errseq_check_and_advance);
diff --git a/mm/filemap.c b/mm/filemap.c
index 2e906ef..3247b42 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -309,6 +309,16 @@ int filemap_check_errors(struct address_space *mapping)
}
EXPORT_SYMBOL(filemap_check_errors);
+static int filemap_check_and_keep_errors(struct address_space *mapping)
+{
+ /* Check for outstanding write errors */
+ if (test_bit(AS_EIO, &mapping->flags))
+ return -EIO;
+ if (test_bit(AS_ENOSPC, &mapping->flags))
+ return -ENOSPC;
+ return 0;
+}
+
/**
* __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
@@ -408,17 +418,16 @@ bool filemap_range_has_page(struct address_space *mapping,
}
EXPORT_SYMBOL(filemap_range_has_page);
-static int __filemap_fdatawait_range(struct address_space *mapping,
+static void __filemap_fdatawait_range(struct address_space *mapping,
loff_t start_byte, loff_t end_byte)
{
pgoff_t index = start_byte >> PAGE_SHIFT;
pgoff_t end = end_byte >> PAGE_SHIFT;
struct pagevec pvec;
int nr_pages;
- int ret = 0;
if (end_byte < start_byte)
- goto out;
+ return;
pagevec_init(&pvec, 0);
while ((index <= end) &&
@@ -435,14 +444,11 @@ static int __filemap_fdatawait_range(struct address_space *mapping,
continue;
wait_on_page_writeback(page);
- if (TestClearPageError(page))
- ret = -EIO;
+ ClearPageError(page);
}
pagevec_release(&pvec);
cond_resched();
}
-out:
- return ret;
}
/**
@@ -462,14 +468,8 @@ out:
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
loff_t end_byte)
{
- int ret, ret2;
-
- ret = __filemap_fdatawait_range(mapping, start_byte, end_byte);
- ret2 = filemap_check_errors(mapping);
- if (!ret)
- ret = ret2;
-
- return ret;
+ __filemap_fdatawait_range(mapping, start_byte, end_byte);
+ return filemap_check_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range);
@@ -485,15 +485,17 @@ EXPORT_SYMBOL(filemap_fdatawait_range);
* call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
* fsfreeze(8)
*/
-void filemap_fdatawait_keep_errors(struct address_space *mapping)
+int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
loff_t i_size = i_size_read(mapping->host);
if (i_size == 0)
- return;
+ return 0;
__filemap_fdatawait_range(mapping, 0, i_size - 1);
+ return filemap_check_and_keep_errors(mapping);
}
+EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
/**
* filemap_fdatawait - wait for all under-writeback pages to complete
@@ -535,6 +537,9 @@ int filemap_write_and_wait(struct address_space *mapping)
int err2 = filemap_fdatawait(mapping);
if (!err)
err = err2;
+ } else {
+ /* Clear any previously stored errors */
+ filemap_check_errors(mapping);
}
} else {
err = filemap_check_errors(mapping);
@@ -569,6 +574,9 @@ int filemap_write_and_wait_range(struct address_space *mapping,
lstart, lend);
if (!err)
err = err2;
+ } else {
+ /* Clear any previously stored errors */
+ filemap_check_errors(mapping);
}
} else {
err = filemap_check_errors(mapping);
@@ -577,6 +585,90 @@ int filemap_write_and_wait_range(struct address_space *mapping,
}
EXPORT_SYMBOL(filemap_write_and_wait_range);
+void __filemap_set_wb_err(struct address_space *mapping, int err)
+{
+ errseq_t eseq = __errseq_set(&mapping->wb_err, err);
+
+ trace_filemap_set_wb_err(mapping, eseq);
+}
+EXPORT_SYMBOL(__filemap_set_wb_err);
+
+/**
+ * file_check_and_advance_wb_err - report wb error (if any) that was previously
+ * and advance wb_err to current one
+ * @file: struct file on which the error is being reported
+ *
+ * When userland calls fsync (or something like nfsd does the equivalent), we
+ * want to report any writeback errors that occurred since the last fsync (or
+ * since the file was opened if there haven't been any).
+ *
+ * Grab the wb_err from the mapping. If it matches what we have in the file,
+ * then just quickly return 0. The file is all caught up.
+ *
+ * If it doesn't match, then take the mapping value, set the "seen" flag in
+ * it and try to swap it into place. If it works, or another task beat us
+ * to it with the new value, then update the f_wb_err and return the error
+ * portion. The error at this point must be reported via proper channels
+ * (a'la fsync, or NFS COMMIT operation, etc.).
+ *
+ * While we handle mapping->wb_err with atomic operations, the f_wb_err
+ * value is protected by the f_lock since we must ensure that it reflects
+ * the latest value swapped in for this file descriptor.
+ */
+int file_check_and_advance_wb_err(struct file *file)
+{
+ int err = 0;
+ errseq_t old = READ_ONCE(file->f_wb_err);
+ struct address_space *mapping = file->f_mapping;
+
+ /* Locklessly handle the common case where nothing has changed */
+ if (errseq_check(&mapping->wb_err, old)) {
+ /* Something changed, must use slow path */
+ spin_lock(&file->f_lock);
+ old = file->f_wb_err;
+ err = errseq_check_and_advance(&mapping->wb_err,
+ &file->f_wb_err);
+ trace_file_check_and_advance_wb_err(file, old);
+ spin_unlock(&file->f_lock);
+ }
+ return err;
+}
+EXPORT_SYMBOL(file_check_and_advance_wb_err);
+
+/**
+ * file_write_and_wait_range - write out & wait on a file range
+ * @file: file pointing to address_space with pages
+ * @lstart: offset in bytes where the range starts
+ * @lend: offset in bytes where the range ends (inclusive)
+ *
+ * Write out and wait upon file offsets lstart->lend, inclusive.
+ *
+ * Note that @lend is inclusive (describes the last byte to be written) so
+ * that this function can be used to write to the very end-of-file (end = -1).
+ *
+ * After writing out and waiting on the data, we check and advance the
+ * f_wb_err cursor to the latest value, and return any errors detected there.
+ */
+int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
+{
+ int err = 0, err2;
+ struct address_space *mapping = file->f_mapping;
+
+ if ((!dax_mapping(mapping) && mapping->nrpages) ||
+ (dax_mapping(mapping) && mapping->nrexceptional)) {
+ err = __filemap_fdatawrite_range(mapping, lstart, lend,
+ WB_SYNC_ALL);
+ /* See comment of filemap_write_and_wait() */
+ if (err != -EIO)
+ __filemap_fdatawait_range(mapping, lstart, lend);
+ }
+ err2 = file_check_and_advance_wb_err(file);
+ if (!err)
+ err = err2;
+ return err;
+}
+EXPORT_SYMBOL(file_write_and_wait_range);
+
/**
* replace_page_cache_page - replace a pagecache page with a new one
* @old: page to be replaced
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a74c831..dbe3e50 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -684,7 +684,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
* the first EIO, but we're not worse than other parts
* of the kernel.
*/
- mapping_set_error(mapping, EIO);
+ mapping_set_error(mapping, -EIO);
}
return me_pagecache_clean(p, pfn);
OpenPOWER on IntegriCloud