diff options
-rw-r--r-- | Documentation/filesystems/vfs.txt | 44 | ||||
-rw-r--r-- | MAINTAINERS | 6 | ||||
-rw-r--r-- | drivers/dax/device.c | 1 | ||||
-rw-r--r-- | fs/block_dev.c | 3 | ||||
-rw-r--r-- | fs/btrfs/file.c | 13 | ||||
-rw-r--r-- | fs/buffer.c | 20 | ||||
-rw-r--r-- | fs/dax.c | 4 | ||||
-rw-r--r-- | fs/ext2/file.c | 5 | ||||
-rw-r--r-- | fs/ext4/fsync.c | 2 | ||||
-rw-r--r-- | fs/file_table.c | 1 | ||||
-rw-r--r-- | fs/gfs2/lops.c | 2 | ||||
-rw-r--r-- | fs/jbd2/commit.c | 16 | ||||
-rw-r--r-- | fs/libfs.c | 6 | ||||
-rw-r--r-- | fs/open.c | 3 | ||||
-rw-r--r-- | fs/xfs/xfs_file.c | 2 | ||||
-rw-r--r-- | include/linux/buffer_head.h | 1 | ||||
-rw-r--r-- | include/linux/errseq.h | 19 | ||||
-rw-r--r-- | include/linux/fs.h | 61 | ||||
-rw-r--r-- | include/linux/pagemap.h | 31 | ||||
-rw-r--r-- | include/trace/events/filemap.h | 57 | ||||
-rw-r--r-- | lib/Makefile | 2 | ||||
-rw-r--r-- | lib/errseq.c | 208 | ||||
-rw-r--r-- | mm/filemap.c | 126 | ||||
-rw-r--r-- | mm/memory-failure.c | 2 |
24 files changed, 572 insertions, 63 deletions
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index f42b906..48c9faa 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -576,7 +576,43 @@ should clear PG_Dirty and set PG_Writeback. It can be actually written at any point after PG_Dirty is clear. Once it is known to be safe, PG_Writeback is cleared. -Writeback makes use of a writeback_control structure... +Writeback makes use of a writeback_control structure to direct the +operations. This gives the the writepage and writepages operations some +information about the nature of and reason for the writeback request, +and the constraints under which it is being done. It is also used to +return information back to the caller about the result of a writepage or +writepages request. + +Handling errors during writeback +-------------------------------- +Most applications that do buffered I/O will periodically call a file +synchronization call (fsync, fdatasync, msync or sync_file_range) to +ensure that data written has made it to the backing store. When there +is an error during writeback, they expect that error to be reported when +a file sync request is made. After an error has been reported on one +request, subsequent requests on the same file descriptor should return +0, unless further writeback errors have occurred since the previous file +syncronization. + +Ideally, the kernel would report errors only on file descriptions on +which writes were done that subsequently failed to be written back. The +generic pagecache infrastructure does not track the file descriptions +that have dirtied each individual page however, so determining which +file descriptors should get back an error is not possible. + +Instead, the generic writeback error tracking infrastructure in the +kernel settles for reporting errors to fsync on all file descriptions +that were open at the time that the error occurred. In a situation with +multiple writers, all of them will get back an error on a subsequent fsync, +even if all of the writes done through that particular file descriptor +succeeded (or even if there were no writes on that file descriptor at all). + +Filesystems that wish to use this infrastructure should call +mapping_set_error to record the error in the address_space when it +occurs. Then, after writing back data from the pagecache in their +file->fsync operation, they should call file_check_and_advance_wb_err to +ensure that the struct file's error cursor has advanced to the correct +point in the stream of errors emitted by the backing device(s). struct address_space_operations ------------------------------- @@ -804,7 +840,8 @@ struct address_space_operations { The File Object =============== -A file object represents a file opened by a process. +A file object represents a file opened by a process. This is also known +as an "open file description" in POSIX parlance. struct file_operations @@ -887,7 +924,8 @@ otherwise noted. release: called when the last reference to an open file is closed - fsync: called by the fsync(2) system call + fsync: called by the fsync(2) system call. Also see the section above + entitled "Handling errors during writeback". fasync: called by the fcntl(2) system call when asynchronous (non-blocking) mode is enabled for a file diff --git a/MAINTAINERS b/MAINTAINERS index b31be75..71d4380 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5069,6 +5069,12 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/kristoffer/linux-hpc.git F: drivers/video/fbdev/s1d13xxxfb.c F: include/video/s1d13xxxfb.h +ERRSEQ ERROR TRACKING INFRASTRUCTURE +M: Jeff Layton <jlayton@poochiereds.net> +S: Maintained +F: lib/errseq.c +F: include/linux/errseq.h + ET131X NETWORK DRIVER M: Mark Einon <mark.einon@gmail.com> S: Odd Fixes diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 006e657..12943d1 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -499,6 +499,7 @@ static int dax_open(struct inode *inode, struct file *filp) inode->i_mapping = __dax_inode->i_mapping; inode->i_mapping->host = __dax_inode; filp->f_mapping = inode->i_mapping; + filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); filp->private_data = dev_dax; inode->i_flags = S_DAX; diff --git a/fs/block_dev.c b/fs/block_dev.c index a7df151..9941dc8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -632,7 +632,7 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) struct block_device *bdev = I_BDEV(bd_inode); int error; - error = filemap_write_and_wait_range(filp->f_mapping, start, end); + error = file_write_and_wait_range(filp, start, end); if (error) return error; @@ -1751,6 +1751,7 @@ static int blkdev_open(struct inode * inode, struct file * filp) return -ENOMEM; filp->f_mapping = bdev->bd_inode->i_mapping; + filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); return blkdev_get(bdev, filp->f_mode, filp); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2433870..a85d790 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2032,7 +2032,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; - int ret = 0; + int ret = 0, err; bool full_sync = 0; u64 len; @@ -2051,7 +2051,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) - return ret; + goto out; inode_lock(inode); atomic_inc(&root->log_batch); @@ -2156,10 +2156,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * An ordered extent might have started before and completed * already with io errors, in which case the inode was not * updated and we end up here. So check the inode's mapping - * flags for any errors that might have happened while doing - * writeback of file data. + * for any errors that might have happened since we last + * checked called fsync. */ - ret = filemap_check_errors(inode->i_mapping); + ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); inode_unlock(inode); goto out; } @@ -2248,6 +2248,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = btrfs_end_transaction(trans); } out: + err = file_check_and_advance_wb_err(file); + if (!ret) + ret = err; return ret > 0 ? -EIO : ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 5c2cba8..5234b15 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -178,7 +178,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate) set_buffer_uptodate(bh); } else { buffer_io_error(bh, ", lost sync page write"); - set_buffer_write_io_error(bh); + mark_buffer_write_io_error(bh); clear_buffer_uptodate(bh); } unlock_buffer(bh); @@ -352,8 +352,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) set_buffer_uptodate(bh); } else { buffer_io_error(bh, ", lost async page write"); - mapping_set_error(page->mapping, -EIO); - set_buffer_write_io_error(bh); + mark_buffer_write_io_error(bh); clear_buffer_uptodate(bh); SetPageError(page); } @@ -481,8 +480,6 @@ static void __remove_assoc_queue(struct buffer_head *bh) { list_del_init(&bh->b_assoc_buffers); WARN_ON(!bh->b_assoc_map); - if (buffer_write_io_error(bh)) - set_bit(AS_EIO, &bh->b_assoc_map->flags); bh->b_assoc_map = NULL; } @@ -1181,6 +1178,17 @@ void mark_buffer_dirty(struct buffer_head *bh) } EXPORT_SYMBOL(mark_buffer_dirty); +void mark_buffer_write_io_error(struct buffer_head *bh) +{ + set_buffer_write_io_error(bh); + /* FIXME: do we need to set this in both places? */ + if (bh->b_page && bh->b_page->mapping) + mapping_set_error(bh->b_page->mapping, -EIO); + if (bh->b_assoc_map) + mapping_set_error(bh->b_assoc_map, -EIO); +} +EXPORT_SYMBOL(mark_buffer_write_io_error); + /* * Decrement a buffer_head's reference count. If all buffers against a page * have zero reference count, are clean and unlocked, and if the page is clean @@ -3282,8 +3290,6 @@ drop_buffers(struct page *page, struct buffer_head **buffers_to_free) bh = head; do { - if (buffer_write_io_error(bh) && page->mapping) - mapping_set_error(page->mapping, -EIO); if (buffer_busy(bh)) goto failed; bh = bh->b_this_page; @@ -855,8 +855,10 @@ int dax_writeback_mapping_range(struct address_space *mapping, ret = dax_writeback_one(bdev, dax_dev, mapping, indices[i], pvec.pages[i]); - if (ret < 0) + if (ret < 0) { + mapping_set_error(mapping, ret); goto out; + } } start_index = indices[pvec.nr - 1] + 1; } diff --git a/fs/ext2/file.c b/fs/ext2/file.c index b21891a..d34d32b 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -174,15 +174,12 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) { int ret; struct super_block *sb = file->f_mapping->host->i_sb; - struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; ret = generic_file_fsync(file, start, end, datasync); - if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) { + if (ret == -EIO) /* We don't really know where the IO error happened... */ ext2_error(sb, __func__, "detected IO error when writing metadata buffers"); - ret = -EIO; - } return ret; } diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 9d54960..aae2c39 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -124,7 +124,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); if (ret) return ret; /* diff --git a/fs/file_table.c b/fs/file_table.c index 954d510..72e861a 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -168,6 +168,7 @@ struct file *alloc_file(const struct path *path, fmode_t mode, file->f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; + file->f_wb_err = filemap_sample_wb_err(file->f_mapping); if ((mode & FMODE_READ) && likely(fop->read || fop->read_iter)) mode |= FMODE_CAN_READ; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index e5259cd..3010f9e 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -180,7 +180,7 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, bh = bh->b_this_page; do { if (error) - set_buffer_write_io_error(bh); + mark_buffer_write_io_error(bh); unlock_buffer(bh); next = bh->b_this_page; size -= bh->b_size; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index b6b194e..3c1c313 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -263,18 +263,10 @@ static int journal_finish_inode_data_buffers(journal_t *journal, continue; jinode->i_flags |= JI_COMMIT_RUNNING; spin_unlock(&journal->j_list_lock); - err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); - if (err) { - /* - * Because AS_EIO is cleared by - * filemap_fdatawait_range(), set it again so - * that user process can get -EIO from fsync(). - */ - mapping_set_error(jinode->i_vfs_inode->i_mapping, -EIO); - - if (!ret) - ret = err; - } + err = filemap_fdatawait_keep_errors( + jinode->i_vfs_inode->i_mapping); + if (!ret) + ret = err; spin_lock(&journal->j_list_lock); jinode->i_flags &= ~JI_COMMIT_RUNNING; smp_mb(); @@ -974,7 +974,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, int err; int ret; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(file, start, end); if (err) return err; @@ -991,6 +991,10 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, out: inode_unlock(inode); + /* check and advance again to catch errors after syncing out buffers */ + err = file_check_and_advance_wb_err(file); + if (ret == 0) + ret = err; return ret; } EXPORT_SYMBOL(__generic_file_fsync); @@ -707,6 +707,9 @@ static int do_dentry_open(struct file *f, f->f_inode = inode; f->f_mapping = inode->i_mapping; + /* Ensure that we skip any errors that predate opening of the file */ + f->f_wb_err = filemap_sample_wb_err(f->f_mapping); + if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH; f->f_op = &empty_fops; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 17f27a2..51dfae5 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -140,7 +140,7 @@ xfs_file_fsync( trace_xfs_file_fsync(ip); - error = filemap_write_and_wait_range(inode->i_mapping, start, end); + error = file_write_and_wait_range(file, start, end); if (error) return error; diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index bd029e52..e0abeba 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -149,6 +149,7 @@ void buffer_check_dirty_writeback(struct page *page, */ void mark_buffer_dirty(struct buffer_head *bh); +void mark_buffer_write_io_error(struct buffer_head *bh); void init_buffer(struct buffer_head *, bh_end_io_t *, void *); void touch_buffer(struct buffer_head *bh); void set_bh_page(struct buffer_head *bh, diff --git a/include/linux/errseq.h b/include/linux/errseq.h new file mode 100644 index 0000000..9e0d444 --- /dev/null +++ b/include/linux/errseq.h @@ -0,0 +1,19 @@ +#ifndef _LINUX_ERRSEQ_H +#define _LINUX_ERRSEQ_H + +/* See lib/errseq.c for more info */ + +typedef u32 errseq_t; + +errseq_t __errseq_set(errseq_t *eseq, int err); +static inline void errseq_set(errseq_t *eseq, int err) +{ + /* Optimize for the common case of no error */ + if (unlikely(err)) + __errseq_set(eseq, err); +} + +errseq_t errseq_sample(errseq_t *eseq); +int errseq_check(errseq_t *eseq, errseq_t since); +int errseq_check_and_advance(errseq_t *eseq, errseq_t *since); +#endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 978fb59..0cfa471 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -32,6 +32,7 @@ #include <linux/workqueue.h> #include <linux/delayed_call.h> #include <linux/uuid.h> +#include <linux/errseq.h> #include <asm/byteorder.h> #include <uapi/linux/fs.h> @@ -401,6 +402,7 @@ struct address_space { gfp_t gfp_mask; /* implicit gfp mask for allocations */ struct list_head private_list; /* ditto */ void *private_data; /* ditto */ + errseq_t wb_err; } __attribute__((aligned(sizeof(long)))); /* * On most architectures that alignment is already the case; but @@ -879,6 +881,7 @@ struct file { struct list_head f_tfile_llink; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; + errseq_t f_wb_err; } __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ struct file_handle { @@ -2536,7 +2539,7 @@ extern int write_inode_now(struct inode *, int); extern int filemap_fdatawrite(struct address_space *); extern int filemap_flush(struct address_space *); extern int filemap_fdatawait(struct address_space *); -extern void filemap_fdatawait_keep_errors(struct address_space *); +extern int filemap_fdatawait_keep_errors(struct address_space *mapping); extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); extern bool filemap_range_has_page(struct address_space *, loff_t lstart, @@ -2550,6 +2553,62 @@ extern int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end); extern int filemap_check_errors(struct address_space *mapping); +extern void __filemap_set_wb_err(struct address_space *mapping, int err); +extern int __must_check file_check_and_advance_wb_err(struct file *file); +extern int __must_check file_write_and_wait_range(struct file *file, + loff_t start, loff_t end); + +/** + * filemap_set_wb_err - set a writeback error on an address_space + * @mapping: mapping in which to set writeback error + * @err: error to be set in mapping + * + * When writeback fails in some way, we must record that error so that + * userspace can be informed when fsync and the like are called. We endeavor + * to report errors on any file that was open at the time of the error. Some + * internal callers also need to know when writeback errors have occurred. + * + * When a writeback error occurs, most filesystems will want to call + * filemap_set_wb_err to record the error in the mapping so that it will be + * automatically reported whenever fsync is called on the file. + * + * FIXME: mention FS_* flag here? + */ +static inline void filemap_set_wb_err(struct address_space *mapping, int err) +{ + /* Fastpath for common case of no error */ + if (unlikely(err)) + __filemap_set_wb_err(mapping, err); +} + +/** + * filemap_check_wb_error - has an error occurred since the mark was sampled? + * @mapping: mapping to check for writeback errors + * @since: previously-sampled errseq_t + * + * Grab the errseq_t value from the mapping, and see if it has changed "since" + * the given value was sampled. + * + * If it has then report the latest error set, otherwise return 0. + */ +static inline int filemap_check_wb_err(struct address_space *mapping, + errseq_t since) +{ + return errseq_check(&mapping->wb_err, since); +} + +/** + * filemap_sample_wb_err - sample the current errseq_t to test for later errors + * @mapping: mapping to be sampled + * + * Writeback errors are always reported relative to a particular sample point + * in the past. This function provides those sample points. + */ +static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) +{ + return errseq_sample(&mapping->wb_err); +} + extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e7bbd9d..baa9344 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -28,14 +28,33 @@ enum mapping_flags { AS_NO_WRITEBACK_TAGS = 5, }; +/** + * mapping_set_error - record a writeback error in the address_space + * @mapping - the mapping in which an error should be set + * @error - the error to set in the mapping + * + * When writeback fails in some way, we must record that error so that + * userspace can be informed when fsync and the like are called. We endeavor + * to report errors on any file that was open at the time of the error. Some + * internal callers also need to know when writeback errors have occurred. + * + * When a writeback error occurs, most filesystems will want to call + * mapping_set_error to record the error in the mapping so that it can be + * reported when the application calls fsync(2). + */ static inline void mapping_set_error(struct address_space *mapping, int error) { - if (unlikely(error)) { - if (error == -ENOSPC) - set_bit(AS_ENOSPC, &mapping->flags); - else - set_bit(AS_EIO, &mapping->flags); - } + if (likely(!error)) + return; + + /* Record in wb_err for checkers using errseq_t based tracking */ + filemap_set_wb_err(mapping, error); + + /* Record it in flags for now, for legacy callers */ + if (error == -ENOSPC) + set_bit(AS_ENOSPC, &mapping->flags); + else + set_bit(AS_EIO, &mapping->flags); } static inline void mapping_set_unevictable(struct address_space *mapping) diff --git a/include/trace/events/filemap.h b/include/trace/events/filemap.h index 42febb6..ff91325 100644 --- a/include/trace/events/filemap.h +++ b/include/trace/events/filemap.h @@ -10,6 +10,7 @@ #include <linux/memcontrol.h> #include <linux/device.h> #include <linux/kdev_t.h> +#include <linux/errseq.h> DECLARE_EVENT_CLASS(mm_filemap_op_page_cache, @@ -52,6 +53,62 @@ DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_add_to_page_cache, TP_ARGS(page) ); +TRACE_EVENT(filemap_set_wb_err, + TP_PROTO(struct address_space *mapping, errseq_t eseq), + + TP_ARGS(mapping, eseq), + + TP_STRUCT__entry( + __field(unsigned long, i_ino) + __field(dev_t, s_dev) + __field(errseq_t, errseq) + ), + + TP_fast_assign( + __entry->i_ino = mapping->host->i_ino; + __entry->errseq = eseq; + if (mapping->host->i_sb) + __entry->s_dev = mapping->host->i_sb->s_dev; + else + __entry->s_dev = mapping->host->i_rdev; + ), + + TP_printk("dev=%d:%d ino=0x%lx errseq=0x%x", + MAJOR(__entry->s_dev), MINOR(__entry->s_dev), + __entry->i_ino, __entry->errseq) +); + +TRACE_EVENT(file_check_and_advance_wb_err, + TP_PROTO(struct file *file, errseq_t old), + + TP_ARGS(file, old), + + TP_STRUCT__entry( + __field(struct file *, file); + __field(unsigned long, i_ino) + __field(dev_t, s_dev) + __field(errseq_t, old) + __field(errseq_t, new) + ), + + TP_fast_assign( + __entry->file = file; + __entry->i_ino = file->f_mapping->host->i_ino; + if (file->f_mapping->host->i_sb) + __entry->s_dev = + file->f_mapping->host->i_sb->s_dev; + else + __entry->s_dev = + file->f_mapping->host->i_rdev; + __entry->old = old; + __entry->new = file->f_wb_err; + ), + + TP_printk("file=%p dev=%d:%d ino=0x%lx old=0x%x new=0x%x", + __entry->file, MAJOR(__entry->s_dev), + MINOR(__entry->s_dev), __entry->i_ino, __entry->old, + __entry->new) +); #endif /* _TRACE_FILEMAP_H */ /* This part must be outside protection */ diff --git a/lib/Makefile b/lib/Makefile index 7fb6ab7..5a00832 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -38,7 +38,7 @@ obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \ gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \ bsearch.o find_bit.o llist.o memweight.o kfifo.o \ percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \ - once.o refcount.o usercopy.o + once.o refcount.o usercopy.o errseq.o obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o obj-y += hexdump.o diff --git a/lib/errseq.c b/lib/errseq.c new file mode 100644 index 0000000..841fa24 --- /dev/null +++ b/lib/errseq.c @@ -0,0 +1,208 @@ +#include <linux/err.h> +#include <linux/bug.h> +#include <linux/atomic.h> +#include <linux/errseq.h> + +/* + * An errseq_t is a way of recording errors in one place, and allowing any + * number of "subscribers" to tell whether it has changed since a previous + * point where it was sampled. + * + * It's implemented as an unsigned 32-bit value. The low order bits are + * designated to hold an error code (between 0 and -MAX_ERRNO). The upper bits + * are used as a counter. This is done with atomics instead of locking so that + * these functions can be called from any context. + * + * The general idea is for consumers to sample an errseq_t value. That value + * can later be used to tell whether any new errors have occurred since that + * sampling was done. + * + * Note that there is a risk of collisions if new errors are being recorded + * frequently, since we have so few bits to use as a counter. + * + * To mitigate this, one bit is used as a flag to tell whether the value has + * been sampled since a new value was recorded. That allows us to avoid bumping + * the counter if no one has sampled it since the last time an error was + * recorded. + * + * A new errseq_t should always be zeroed out. A errseq_t value of all zeroes + * is the special (but common) case where there has never been an error. An all + * zero value thus serves as the "epoch" if one wishes to know whether there + * has ever been an error set since it was first initialized. + */ + +/* The low bits are designated for error code (max of MAX_ERRNO) */ +#define ERRSEQ_SHIFT ilog2(MAX_ERRNO + 1) + +/* This bit is used as a flag to indicate whether the value has been seen */ +#define ERRSEQ_SEEN (1 << ERRSEQ_SHIFT) + +/* The lowest bit of the counter */ +#define ERRSEQ_CTR_INC (1 << (ERRSEQ_SHIFT + 1)) + +/** + * __errseq_set - set a errseq_t for later reporting + * @eseq: errseq_t field that should be set + * @err: error to set + * + * This function sets the error in *eseq, and increments the sequence counter + * if the last sequence was sampled at some point in the past. + * + * Any error set will always overwrite an existing error. + * + * Most callers will want to use the errseq_set inline wrapper to efficiently + * handle the common case where err is 0. + * + * We do return an errseq_t here, primarily for debugging purposes. The return + * value should not be used as a previously sampled value in later calls as it + * will not have the SEEN flag set. + */ +errseq_t __errseq_set(errseq_t *eseq, int err) +{ + errseq_t cur, old; + + /* MAX_ERRNO must be able to serve as a mask */ + BUILD_BUG_ON_NOT_POWER_OF_2(MAX_ERRNO + 1); + + /* + * Ensure the error code actually fits where we want it to go. If it + * doesn't then just throw a warning and don't record anything. We + * also don't accept zero here as that would effectively clear a + * previous error. + */ + old = READ_ONCE(*eseq); + + if (WARN(unlikely(err == 0 || (unsigned int)-err > MAX_ERRNO), + "err = %d\n", err)) + return old; + + for (;;) { + errseq_t new; + + /* Clear out error bits and set new error */ + new = (old & ~(MAX_ERRNO|ERRSEQ_SEEN)) | -err; + + /* Only increment if someone has looked at it */ + if (old & ERRSEQ_SEEN) + new += ERRSEQ_CTR_INC; + + /* If there would be no change, then call it done */ + if (new == old) { + cur = new; + break; + } + + /* Try to swap the new value into place */ + cur = cmpxchg(eseq, old, new); + + /* + * Call it success if we did the swap or someone else beat us + * to it for the same value. + */ + if (likely(cur == old || cur == new)) + break; + + /* Raced with an update, try again */ + old = cur; + } + return cur; +} +EXPORT_SYMBOL(__errseq_set); + +/** + * errseq_sample - grab current errseq_t value + * @eseq: pointer to errseq_t to be sampled + * + * This function allows callers to sample an errseq_t value, marking it as + * "seen" if required. + */ +errseq_t errseq_sample(errseq_t *eseq) +{ + errseq_t old = READ_ONCE(*eseq); + errseq_t new = old; + + /* + * For the common case of no errors ever having been set, we can skip + * marking the SEEN bit. Once an error has been set, the value will + * never go back to zero. + */ + if (old != 0) { + new |= ERRSEQ_SEEN; + if (old != new) + cmpxchg(eseq, old, new); + } + return new; +} +EXPORT_SYMBOL(errseq_sample); + +/** + * errseq_check - has an error occurred since a particular sample point? + * @eseq: pointer to errseq_t value to be checked + * @since: previously-sampled errseq_t from which to check + * + * Grab the value that eseq points to, and see if it has changed "since" + * the given value was sampled. The "since" value is not advanced, so there + * is no need to mark the value as seen. + * + * Returns the latest error set in the errseq_t or 0 if it hasn't changed. + */ +int errseq_check(errseq_t *eseq, errseq_t since) +{ + errseq_t cur = READ_ONCE(*eseq); + + if (likely(cur == since)) + return 0; + return -(cur & MAX_ERRNO); +} +EXPORT_SYMBOL(errseq_check); + +/** + * errseq_check_and_advance - check an errseq_t and advance to current value + * @eseq: pointer to value being checked and reported + * @since: pointer to previously-sampled errseq_t to check against and advance + * + * Grab the eseq value, and see whether it matches the value that "since" + * points to. If it does, then just return 0. + * + * If it doesn't, then the value has changed. Set the "seen" flag, and try to + * swap it into place as the new eseq value. Then, set that value as the new + * "since" value, and return whatever the error portion is set to. + * + * Note that no locking is provided here for concurrent updates to the "since" + * value. The caller must provide that if necessary. Because of this, callers + * may want to do a lockless errseq_check before taking the lock and calling + * this. + */ +int errseq_check_and_advance(errseq_t *eseq, errseq_t *since) +{ + int err = 0; + errseq_t old, new; + + /* + * Most callers will want to use the inline wrapper to check this, + * so that the common case of no error is handled without needing + * to take the lock that protects the "since" value. + */ + old = READ_ONCE(*eseq); + if (old != *since) { + /* + * Set the flag and try to swap it into place if it has + * changed. + * + * We don't care about the outcome of the swap here. If the + * swap doesn't occur, then it has either been updated by a + * writer who is altering the value in some way (updating + * counter or resetting the error), or another reader who is + * just setting the "seen" flag. Either outcome is OK, and we + * can advance "since" and return an error based on what we + * have. + */ + new = old | ERRSEQ_SEEN; + if (new != old) + cmpxchg(eseq, old, new); + *since = new; + err = -(new & MAX_ERRNO); + } + return err; +} +EXPORT_SYMBOL(errseq_check_and_advance); diff --git a/mm/filemap.c b/mm/filemap.c index 2e906ef..3247b42 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -309,6 +309,16 @@ int filemap_check_errors(struct address_space *mapping) } EXPORT_SYMBOL(filemap_check_errors); +static int filemap_check_and_keep_errors(struct address_space *mapping) +{ + /* Check for outstanding write errors */ + if (test_bit(AS_EIO, &mapping->flags)) + return -EIO; + if (test_bit(AS_ENOSPC, &mapping->flags)) + return -ENOSPC; + return 0; +} + /** * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range * @mapping: address space structure to write @@ -408,17 +418,16 @@ bool filemap_range_has_page(struct address_space *mapping, } EXPORT_SYMBOL(filemap_range_has_page); -static int __filemap_fdatawait_range(struct address_space *mapping, +static void __filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { pgoff_t index = start_byte >> PAGE_SHIFT; pgoff_t end = end_byte >> PAGE_SHIFT; struct pagevec pvec; int nr_pages; - int ret = 0; if (end_byte < start_byte) - goto out; + return; pagevec_init(&pvec, 0); while ((index <= end) && @@ -435,14 +444,11 @@ static int __filemap_fdatawait_range(struct address_space *mapping, continue; wait_on_page_writeback(page); - if (TestClearPageError(page)) - ret = -EIO; + ClearPageError(page); } pagevec_release(&pvec); cond_resched(); } -out: - return ret; } /** @@ -462,14 +468,8 @@ out: int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, loff_t end_byte) { - int ret, ret2; - - ret = __filemap_fdatawait_range(mapping, start_byte, end_byte); - ret2 = filemap_check_errors(mapping); - if (!ret) - ret = ret2; - - return ret; + __filemap_fdatawait_range(mapping, start_byte, end_byte); + return filemap_check_errors(mapping); } EXPORT_SYMBOL(filemap_fdatawait_range); @@ -485,15 +485,17 @@ EXPORT_SYMBOL(filemap_fdatawait_range); * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), * fsfreeze(8) */ -void filemap_fdatawait_keep_errors(struct address_space *mapping) +int filemap_fdatawait_keep_errors(struct address_space *mapping) { loff_t i_size = i_size_read(mapping->host); if (i_size == 0) - return; + return 0; __filemap_fdatawait_range(mapping, 0, i_size - 1); + return filemap_check_and_keep_errors(mapping); } +EXPORT_SYMBOL(filemap_fdatawait_keep_errors); /** * filemap_fdatawait - wait for all under-writeback pages to complete @@ -535,6 +537,9 @@ int filemap_write_and_wait(struct address_space *mapping) int err2 = filemap_fdatawait(mapping); if (!err) err = err2; + } else { + /* Clear any previously stored errors */ + filemap_check_errors(mapping); } } else { err = filemap_check_errors(mapping); @@ -569,6 +574,9 @@ int filemap_write_and_wait_range(struct address_space *mapping, lstart, lend); if (!err) err = err2; + } else { + /* Clear any previously stored errors */ + filemap_check_errors(mapping); } } else { err = filemap_check_errors(mapping); @@ -577,6 +585,90 @@ int filemap_write_and_wait_range(struct address_space *mapping, } EXPORT_SYMBOL(filemap_write_and_wait_range); +void __filemap_set_wb_err(struct address_space *mapping, int err) +{ + errseq_t eseq = __errseq_set(&mapping->wb_err, err); + + trace_filemap_set_wb_err(mapping, eseq); +} +EXPORT_SYMBOL(__filemap_set_wb_err); + +/** + * file_check_and_advance_wb_err - report wb error (if any) that was previously + * and advance wb_err to current one + * @file: struct file on which the error is being reported + * + * When userland calls fsync (or something like nfsd does the equivalent), we + * want to report any writeback errors that occurred since the last fsync (or + * since the file was opened if there haven't been any). + * + * Grab the wb_err from the mapping. If it matches what we have in the file, + * then just quickly return 0. The file is all caught up. + * + * If it doesn't match, then take the mapping value, set the "seen" flag in + * it and try to swap it into place. If it works, or another task beat us + * to it with the new value, then update the f_wb_err and return the error + * portion. The error at this point must be reported via proper channels + * (a'la fsync, or NFS COMMIT operation, etc.). + * + * While we handle mapping->wb_err with atomic operations, the f_wb_err + * value is protected by the f_lock since we must ensure that it reflects + * the latest value swapped in for this file descriptor. + */ +int file_check_and_advance_wb_err(struct file *file) +{ + int err = 0; + errseq_t old = READ_ONCE(file->f_wb_err); + struct address_space *mapping = file->f_mapping; + + /* Locklessly handle the common case where nothing has changed */ + if (errseq_check(&mapping->wb_err, old)) { + /* Something changed, must use slow path */ + spin_lock(&file->f_lock); + old = file->f_wb_err; + err = errseq_check_and_advance(&mapping->wb_err, + &file->f_wb_err); + trace_file_check_and_advance_wb_err(file, old); + spin_unlock(&file->f_lock); + } + return err; +} +EXPORT_SYMBOL(file_check_and_advance_wb_err); + +/** + * file_write_and_wait_range - write out & wait on a file range + * @file: file pointing to address_space with pages + * @lstart: offset in bytes where the range starts + * @lend: offset in bytes where the range ends (inclusive) + * + * Write out and wait upon file offsets lstart->lend, inclusive. + * + * Note that @lend is inclusive (describes the last byte to be written) so + * that this function can be used to write to the very end-of-file (end = -1). + * + * After writing out and waiting on the data, we check and advance the + * f_wb_err cursor to the latest value, and return any errors detected there. + */ +int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) +{ + int err = 0, err2; + struct address_space *mapping = file->f_mapping; + + if ((!dax_mapping(mapping) && mapping->nrpages) || + (dax_mapping(mapping) && mapping->nrexceptional)) { + err = __filemap_fdatawrite_range(mapping, lstart, lend, + WB_SYNC_ALL); + /* See comment of filemap_write_and_wait() */ + if (err != -EIO) + __filemap_fdatawait_range(mapping, lstart, lend); + } + err2 = file_check_and_advance_wb_err(file); + if (!err) + err = err2; + return err; +} +EXPORT_SYMBOL(file_write_and_wait_range); + /** * replace_page_cache_page - replace a pagecache page with a new one * @old: page to be replaced diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a74c831..dbe3e50 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -684,7 +684,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) * the first EIO, but we're not worse than other parts * of the kernel. */ - mapping_set_error(mapping, EIO); + mapping_set_error(mapping, -EIO); } return me_pagecache_clean(p, pfn); |