49 files changed, 1942 insertions, 714 deletions
diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt index 8146e9f..c2d44e6 100644 --- a/Documentation/filesystems/xfs.txt +++ b/Documentation/filesystems/xfs.txt @@ -348,3 +348,126 @@ Removed Sysctls ---- ------- fs.xfs.xfsbufd_centisec v4.0 fs.xfs.age_buffer_centisecs v4.0 + + +Error handling +============== + +XFS can act differently according to the type of error found during its +operation. The implementation introduces the following concepts to the error +handler: + + -failure speed: + Defines how fast XFS should propagate an error upwards when a specific + error is found during the filesystem operation. It can propagate + immediately, after a defined number of retries, after a set time period, + or simply retry forever. + + -error classes: + Specifies the subsystem the error configuration will apply to, such as + metadata IO or memory allocation. Different subsystems will have + different error handlers for which behaviour can be configured. + + -error handlers: + Defines the behavior for a specific error. + +The filesystem behavior during an error can be set via sysfs files. Each +error handler works independently - the first condition met by an error handler +for a specific class will cause the error to be propagated rather than reset and +retried. + +The action taken by the filesystem when the error is propagated is context +dependent - it may cause a shut down in the case of an unrecoverable error, +it may be reported back to userspace, or it may even be ignored because +there's nothing useful we can do with the error or anyone we can report it to (e.g. +during unmount). + +The configuration files are organized into the following hierarchy for each +mounted filesystem: + + /sys/fs/xfs/<dev>/error/<class>/<error>/ + +Where: + <dev> + The short device name of the mounted filesystem. This is the same device + name that shows up in XFS kernel error messages as "XFS(<dev>): ..." + + <class> + The subsystem the error configuration belongs to. As of 4.9, the defined + classes are: + + - "metadata": applies to metadata buffer write IO + + <error> + The individual error handler configurations. + + +Each filesystem has "global" error configuration options defined in its top +level directory: + + /sys/fs/xfs/<dev>/error/ + + fail_at_unmount (Min: 0 Default: 1 Max: 1) + Defines the filesystem error behavior at unmount time. + + If set to a value of 1, XFS will override all other error configurations + during unmount and replace them with "immediate fail" characteristics. + i.e. no retries, no retry timeout. This will always allow unmount to + succeed when there are persistent errors present. + + If set to 0, the configured retry behaviour will continue until all + retries and/or timeouts have been exhausted. This will delay unmount + completion when there are persistent errors, and it may prevent the + filesystem from ever unmounting fully in the case of "retry forever" + handler configurations. + + Note: there is no guarantee that fail_at_unmount can be set whilst an + unmount is in progress. It is possible that the sysfs entries are + removed by the unmounting filesystem before a "retry forever" error + handler configuration causes unmount to hang, and hence the filesystem + must be configured appropriately before unmount begins to prevent + unmount hangs. + +Each filesystem has specific error class handlers that define the error +propagation behaviour for specific errors.
There is also a "default" error +handler defined, which defines the behaviour for all errors that don't have +specific handlers defined. Where multiple retry constraints are configured for +a single error, the first retry configuration that expires will cause the error +to be propagated. The handler configurations are found in the directory: + + /sys/fs/xfs/<dev>/error/<class>/<error>/ + + max_retries (Min: -1 Default: Varies Max: INTMAX) + Defines the allowed number of retries of a specific error before + the filesystem will propagate the error. The retry count for a given + error context (e.g. a specific metadata buffer) is reset every time + there is a successful completion of the operation. + + Setting the value to "-1" will cause XFS to retry forever for this + specific error. + + Setting the value to "0" will cause XFS to fail immediately when the + specific error is reported. + + Setting the value to "N" (where 0 < N < Max) will make XFS retry the + operation "N" times before propagating the error. + + retry_timeout_seconds (Min: -1 Default: Varies Max: 1 day) + Defines the amount of time (in seconds) that the filesystem is + allowed to retry its operations when the specific error is + found. + + Setting the value to "-1" will allow XFS to retry forever for this + specific error. + + Setting the value to "0" will cause XFS to fail immediately when the + specific error is reported. + + Setting the value to "N" (where 0 < N < Max) will allow XFS to retry the + operation for up to "N" seconds before propagating the error. + +Note: The default behaviour for a specific error handler is dependent on both +the class and error context. For example, the default values for +"metadata/ENODEV" are "0" rather than "-1" so that this error handler defaults +to "fail immediately" behaviour. This is done because ENODEV is a fatal, +unrecoverable error no matter how many times the metadata IO is retried.
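As a concrete illustration of the interface documented in the hunk above, the short userspace program below configures the "metadata" class "default" handler and enables fail_at_unmount through the sysfs files described there. It is only a sketch: the device name "sda1" and the chosen values are hypothetical; only the file layout comes from the documentation itself.

#include <stdio.h>

/* Write a single value to one of the XFS error-configuration sysfs files. */
static int set_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* Retry failed metadata writes up to 5 times or for 60 seconds,
	 * whichever limit is reached first ("sda1" is a made-up device). */
	set_knob("/sys/fs/xfs/sda1/error/metadata/default/max_retries", "5");
	set_knob("/sys/fs/xfs/sda1/error/metadata/default/retry_timeout_seconds", "60");
	/* Make unmount override the retry configuration so it cannot hang. */
	set_knob("/sys/fs/xfs/sda1/error/fail_at_unmount", "1");
	return 0;
}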
diff --git a/MAINTAINERS b/MAINTAINERS index 841ffa3..ef4f7c41 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13099,11 +13099,10 @@ F: arch/x86/xen/*swiotlb* F: drivers/xen/*swiotlb* XFS FILESYSTEM -P: Silicon Graphics Inc M: Dave Chinner <david@fromorbit.com> -M: xfs@oss.sgi.com -L: xfs@oss.sgi.com -W: http://oss.sgi.com/projects/xfs +M: linux-xfs@vger.kernel.org +L: linux-xfs@vger.kernel.org +W: http://xfs.org/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs.git S: Supported F: Documentation/filesystems/xfs.txt @@ -31,6 +31,8 @@ #include <linux/vmstat.h> #include <linux/pfn_t.h> #include <linux/sizes.h> +#include <linux/iomap.h> +#include "internal.h" /* * We use lowest available bit in exceptional entry for locking, other two @@ -580,14 +582,13 @@ static int dax_load_hole(struct address_space *mapping, void *entry, return VM_FAULT_LOCKED; } -static int copy_user_bh(struct page *to, struct inode *inode, - struct buffer_head *bh, unsigned long vaddr) +static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size, + struct page *to, unsigned long vaddr) { struct blk_dax_ctl dax = { - .sector = to_sector(bh, inode), - .size = bh->b_size, + .sector = sector, + .size = size, }; - struct block_device *bdev = bh->b_bdev; void *vto; if (dax_map_atomic(bdev, &dax) < 0) @@ -790,14 +791,13 @@ int dax_writeback_mapping_range(struct address_space *mapping, EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct address_space *mapping, - struct buffer_head *bh, void **entryp, - struct vm_area_struct *vma, struct vm_fault *vmf) + struct block_device *bdev, sector_t sector, size_t size, + void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long)vmf->virtual_address; - struct block_device *bdev = bh->b_bdev; struct blk_dax_ctl dax = { - .sector = to_sector(bh, mapping->host), - .size = bh->b_size, + .sector = sector, + .size = size, }; void *ret; void *entry = *entryp; @@ -868,7 +868,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (vmf->cow_page) { struct page *new_page = vmf->cow_page; if (buffer_written(&bh)) - error = copy_user_bh(new_page, inode, &bh, vaddr); + error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode), + bh.b_size, new_page, vaddr); else clear_user_highpage(new_page, vaddr); if (error) @@ -898,7 +899,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, /* Filesystem should not return unwritten buffers to us! 
*/ WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); - error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf); + error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode), + bh.b_size, &entry, vma, vmf); unlock_entry: put_locked_mapping_entry(mapping, vmf->pgoff, entry); out: @@ -1241,3 +1243,229 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) return dax_zero_page_range(inode, from, length, get_block); } EXPORT_SYMBOL_GPL(dax_truncate_page); + +#ifdef CONFIG_FS_IOMAP +static loff_t +iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + struct iov_iter *iter = data; + loff_t end = pos + length, done = 0; + ssize_t ret = 0; + + if (iov_iter_rw(iter) == READ) { + end = min(end, i_size_read(inode)); + if (pos >= end) + return 0; + + if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) + return iov_iter_zero(min(length, end - pos), iter); + } + + if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED)) + return -EIO; + + while (pos < end) { + unsigned offset = pos & (PAGE_SIZE - 1); + struct blk_dax_ctl dax = { 0 }; + ssize_t map_len; + + dax.sector = iomap->blkno + + (((pos & PAGE_MASK) - iomap->offset) >> 9); + dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; + map_len = dax_map_atomic(iomap->bdev, &dax); + if (map_len < 0) { + ret = map_len; + break; + } + + dax.addr += offset; + map_len -= offset; + if (map_len > end - pos) + map_len = end - pos; + + if (iov_iter_rw(iter) == WRITE) + map_len = copy_from_iter_pmem(dax.addr, map_len, iter); + else + map_len = copy_to_iter(dax.addr, map_len, iter); + dax_unmap_atomic(iomap->bdev, &dax); + if (map_len <= 0) { + ret = map_len ? map_len : -EFAULT; + break; + } + + pos += map_len; + length -= map_len; + done += map_len; + } + + return done ? done : ret; +} + +/** + * iomap_dax_rw - Perform I/O to a DAX file + * @iocb: The control block for this I/O + * @iter: The addresses to do I/O from or to + * @ops: iomap ops passed from the file system + * + * This function performs read and write operations to directly mapped + * persistent memory. The callers needs to take care of read/write exclusion + * and evicting any page cache pages in the region under I/O. + */ +ssize_t +iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, + struct iomap_ops *ops) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = mapping->host; + loff_t pos = iocb->ki_pos, ret = 0, done = 0; + unsigned flags = 0; + + if (iov_iter_rw(iter) == WRITE) + flags |= IOMAP_WRITE; + + /* + * Yes, even DAX files can have page cache attached to them: A zeroed + * page is inserted into the pagecache when we have to serve a write + * fault on a hole. It should never be dirtied and can simply be + * dropped from the pagecache once we get real data for the page. + * + * XXX: This is racy against mmap, and there's nothing we can do about + * it. We'll eventually need to shift this down even further so that + * we can check if we allocated blocks over a hole first. + */ + if (mapping->nrpages) { + ret = invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, + (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + } + + while (iov_iter_count(iter)) { + ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, + iter, iomap_dax_actor); + if (ret <= 0) + break; + pos += ret; + done += ret; + } + + iocb->ki_pos += done; + return done ? 
done : ret; +} +EXPORT_SYMBOL_GPL(iomap_dax_rw); + +/** + * iomap_dax_fault - handle a page fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @ops: iomap ops passed from the file system + * + * When a page fault occurs, filesystems may call this helper in their fault + * or mkwrite handler for DAX files. Assumes the caller has done all the + * necessary locking for the page fault to proceed successfully. + */ +int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + struct iomap_ops *ops) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + unsigned long vaddr = (unsigned long)vmf->virtual_address; + loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; + sector_t sector; + struct iomap iomap = { 0 }; + unsigned flags = 0; + int error, major = 0; + void *entry; + + /* + * Check whether offset isn't beyond end of file now. Caller is supposed + * to hold locks serializing us with truncate / punch hole so this is + * a reliable test. + */ + if (pos >= i_size_read(inode)) + return VM_FAULT_SIGBUS; + + entry = grab_mapping_entry(mapping, vmf->pgoff); + if (IS_ERR(entry)) { + error = PTR_ERR(entry); + goto out; + } + + if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) + flags |= IOMAP_WRITE; + + /* + * Note that we don't bother to use iomap_apply here: DAX required + * the file system block size to be equal the page size, which means + * that we never have to deal with more than a single extent here. + */ + error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); + if (error) + goto unlock_entry; + if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { + error = -EIO; /* fs corruption? */ + goto unlock_entry; + } + + sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9); + + if (vmf->cow_page) { + switch (iomap.type) { + case IOMAP_HOLE: + case IOMAP_UNWRITTEN: + clear_user_highpage(vmf->cow_page, vaddr); + break; + case IOMAP_MAPPED: + error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE, + vmf->cow_page, vaddr); + break; + default: + WARN_ON_ONCE(1); + error = -EIO; + break; + } + + if (error) + goto unlock_entry; + if (!radix_tree_exceptional_entry(entry)) { + vmf->page = entry; + return VM_FAULT_LOCKED; + } + vmf->entry = entry; + return VM_FAULT_DAX_LOCKED; + } + + switch (iomap.type) { + case IOMAP_MAPPED: + if (iomap.flags & IOMAP_F_NEW) { + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + major = VM_FAULT_MAJOR; + } + error = dax_insert_mapping(mapping, iomap.bdev, sector, + PAGE_SIZE, &entry, vma, vmf); + break; + case IOMAP_UNWRITTEN: + case IOMAP_HOLE: + if (!(vmf->flags & FAULT_FLAG_WRITE)) + return dax_load_hole(mapping, entry, vmf); + /*FALLTHRU*/ + default: + WARN_ON_ONCE(1); + error = -EIO; + break; + } + + unlock_entry: + put_locked_mapping_entry(mapping, vmf->pgoff, entry); + out: + if (error == -ENOMEM) + return VM_FAULT_OOM | major; + /* -EBUSY is fine, somebody else faulted on the same PTE */ + if (error < 0 && error != -EBUSY) + return VM_FAULT_SIGBUS | major; + return VM_FAULT_NOPAGE | major; +} +EXPORT_SYMBOL_GPL(iomap_dax_fault); +#endif /* CONFIG_FS_IOMAP */ diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index c634874e..36bea5a 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -1,5 +1,6 @@ config EXT2_FS tristate "Second extended fs support" + select FS_IOMAP if FS_DAX help Ext2 is a standard Linux file system for hard disks. 
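The two helpers exported in the fs/dax.c hunk above, iomap_dax_rw() and iomap_dax_fault(), are intended to be called from a filesystem's file and vm operations, with the filesystem supplying its own struct iomap_ops and taking the locks the comments require. A condensed, hypothetical sketch of that calling pattern follows ("foo" and foo_iomap_ops are placeholders, not part of this patch); the complete conversion for ext2 is in the fs/ext2 hunks below.

#include <linux/fs.h>
#include <linux/uio.h>
#include <linux/mm.h>
#include <linux/dax.h>
#include <linux/iomap.h>

extern struct iomap_ops foo_iomap_ops;	/* hypothetical fs-provided mapping ops */

static ssize_t foo_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	ssize_t ret;

	/* The caller is responsible for read/write exclusion. */
	inode_lock_shared(inode);
	ret = iomap_dax_rw(iocb, to, &foo_iomap_ops);
	inode_unlock_shared(inode);
	return ret;
}

static int foo_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	/* Any fs-private fault serialisation would be taken here. */
	return iomap_dax_fault(vma, vmf, &foo_iomap_ops);
}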
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 06af2f9..37e2be7 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -814,6 +814,7 @@ extern const struct file_operations ext2_file_operations; /* inode.c */ extern const struct address_space_operations ext2_aops; extern const struct address_space_operations ext2_nobh_aops; +extern struct iomap_ops ext2_iomap_ops; /* namei.c */ extern const struct inode_operations ext2_dir_inode_operations; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 5efeefe..423cc01 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -22,11 +22,59 @@ #include <linux/pagemap.h> #include <linux/dax.h> #include <linux/quotaops.h> +#include <linux/iomap.h> +#include <linux/uio.h> #include "ext2.h" #include "xattr.h" #include "acl.h" #ifdef CONFIG_FS_DAX +static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + ssize_t ret; + + if (!iov_iter_count(to)) + return 0; /* skip atime */ + + inode_lock_shared(inode); + ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops); + inode_unlock_shared(inode); + + file_accessed(iocb->ki_filp); + return ret; +} + +static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out_unlock; + ret = file_remove_privs(file); + if (ret) + goto out_unlock; + ret = file_update_time(file); + if (ret) + goto out_unlock; + + ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops); + if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { + i_size_write(inode, iocb->ki_pos); + mark_inode_dirty(inode); + } + +out_unlock: + inode_unlock(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} + /* * The lock ordering for ext2 DAX fault paths is: * @@ -51,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = dax_fault(vma, vmf, ext2_get_block); + ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) @@ -156,14 +204,28 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) return ret; } -/* - * We have mostly NULL's here: the current defaults are ok for - * the ext2 filesystem. 
- */ +static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ +#ifdef CONFIG_FS_DAX + if (IS_DAX(iocb->ki_filp->f_mapping->host)) + return ext2_dax_read_iter(iocb, to); +#endif + return generic_file_read_iter(iocb, to); +} + +static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ +#ifdef CONFIG_FS_DAX + if (IS_DAX(iocb->ki_filp->f_mapping->host)) + return ext2_dax_write_iter(iocb, from); +#endif + return generic_file_write_iter(iocb, from); +} + const struct file_operations ext2_file_operations = { .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, + .read_iter = ext2_file_read_iter, + .write_iter = ext2_file_write_iter, .unlocked_ioctl = ext2_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 303ae2b..1e72d42 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -32,6 +32,7 @@ #include <linux/buffer_head.h> #include <linux/mpage.h> #include <linux/fiemap.h> +#include <linux/iomap.h> #include <linux/namei.h> #include <linux/uio.h> #include "ext2.h" @@ -618,7 +619,7 @@ static void ext2_splice_branch(struct inode *inode, */ static int ext2_get_blocks(struct inode *inode, sector_t iblock, unsigned long maxblocks, - struct buffer_head *bh_result, + u32 *bno, bool *new, bool *boundary, int create) { int err = -EIO; @@ -644,7 +645,6 @@ static int ext2_get_blocks(struct inode *inode, /* Simplest case - block found, no allocation needed */ if (!partial) { first_block = le32_to_cpu(chain[depth - 1].key); - clear_buffer_new(bh_result); /* What's this do? */ count++; /*map more blocks*/ while (count < maxblocks && count <= blocks_to_boundary) { @@ -699,7 +699,6 @@ static int ext2_get_blocks(struct inode *inode, mutex_unlock(&ei->truncate_mutex); if (err) goto cleanup; - clear_buffer_new(bh_result); goto got_it; } } @@ -755,15 +754,16 @@ static int ext2_get_blocks(struct inode *inode, mutex_unlock(&ei->truncate_mutex); goto cleanup; } - } else - set_buffer_new(bh_result); + } else { + *new = true; + } ext2_splice_branch(inode, iblock, partial, indirect_blks, count); mutex_unlock(&ei->truncate_mutex); got_it: - map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); + *bno = le32_to_cpu(chain[depth-1].key); if (count > blocks_to_boundary) - set_buffer_boundary(bh_result); + *boundary = true; err = count; /* Clean up and exit */ partial = chain + depth - 1; /* the whole chain */ @@ -775,19 +775,82 @@ cleanup: return err; } -int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) +int ext2_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - int ret = ext2_get_blocks(inode, iblock, max_blocks, - bh_result, create); - if (ret > 0) { - bh_result->b_size = (ret << inode->i_blkbits); - ret = 0; + bool new = false, boundary = false; + u32 bno; + int ret; + + ret = ext2_get_blocks(inode, iblock, max_blocks, &bno, &new, &boundary, + create); + if (ret <= 0) + return ret; + + map_bh(bh_result, inode->i_sb, bno); + bh_result->b_size = (ret << inode->i_blkbits); + if (new) + set_buffer_new(bh_result); + if (boundary) + set_buffer_boundary(bh_result); + return 0; + +} + +#ifdef CONFIG_FS_DAX +static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned flags, struct iomap *iomap) +{ + unsigned int blkbits = inode->i_blkbits; + unsigned long first_block = 
offset >> blkbits; + unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits; + bool new = false, boundary = false; + u32 bno; + int ret; + + ret = ext2_get_blocks(inode, first_block, max_blocks, + &bno, &new, &boundary, flags & IOMAP_WRITE); + if (ret < 0) + return ret; + + iomap->flags = 0; + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = (u64)first_block << blkbits; + + if (ret == 0) { + iomap->type = IOMAP_HOLE; + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->length = 1 << blkbits; + } else { + iomap->type = IOMAP_MAPPED; + iomap->blkno = (sector_t)bno << (blkbits - 9); + iomap->length = (u64)ret << blkbits; + iomap->flags |= IOMAP_F_MERGED; } - return ret; + if (new) + iomap->flags |= IOMAP_F_NEW; + return 0; } +static int +ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, + ssize_t written, unsigned flags, struct iomap *iomap) +{ + if (iomap->type == IOMAP_MAPPED && + written < length && + (flags & IOMAP_WRITE)) + ext2_write_failed(inode->i_mapping, offset + length); + return 0; +} + +struct iomap_ops ext2_iomap_ops = { + .iomap_begin = ext2_iomap_begin, + .iomap_end = ext2_iomap_end, +}; +#endif /* CONFIG_FS_DAX */ + int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -873,11 +936,10 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) loff_t offset = iocb->ki_pos; ssize_t ret; - if (IS_DAX(inode)) - ret = dax_do_io(iocb, inode, iter, ext2_get_block, NULL, - DIO_LOCKING); - else - ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block); + if (WARN_ON_ONCE(IS_DAX(inode))) + return -EIO; + + ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block); if (ret < 0 && iov_iter_rw(iter) == WRITE) ext2_write_failed(mapping, offset + count); return ret; diff --git a/fs/internal.h b/fs/internal.h index ba07376..8591786 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -12,6 +12,7 @@ struct super_block; struct file_system_type; struct iomap; +struct iomap_ops; struct linux_binprm; struct path; struct mount; @@ -164,3 +165,13 @@ extern struct dentry_operations ns_dentry_operations; extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd, unsigned long arg); extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); + +/* + * iomap support: + */ +typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, + void *data, struct iomap *iomap); + +loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, + unsigned flags, struct iomap_ops *ops, void *data, + iomap_actor_t actor); @@ -27,9 +27,6 @@ #include <linux/dax.h> #include "internal.h" -typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, - void *data, struct iomap *iomap); - /* * Execute a iomap write on a segment of the mapping that spans a * contiguous range of pages that have identical block mapping state. @@ -41,7 +38,7 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, * resources they require in the iomap_begin call, and release them in the * iomap_end call. 
*/ -static loff_t +loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, struct iomap_ops *ops, void *data, iomap_actor_t actor) { @@ -252,6 +249,88 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); +static struct page * +__iomap_read_page(struct inode *inode, loff_t offset) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + + page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + return page; +} + +static loff_t +iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + long status = 0; + ssize_t written = 0; + + do { + struct page *page, *rpage; + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + + offset = (pos & (PAGE_SIZE - 1)); + bytes = min_t(unsigned long, PAGE_SIZE - offset, length); + + rpage = __iomap_read_page(inode, pos); + if (IS_ERR(rpage)) + return PTR_ERR(rpage); + + status = iomap_write_begin(inode, pos, bytes, + AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE, + &page, iomap); + put_page(rpage); + if (unlikely(status)) + return status; + + WARN_ON_ONCE(!PageUptodate(page)); + + status = iomap_write_end(inode, pos, bytes, bytes, page); + if (unlikely(status <= 0)) { + if (WARN_ON_ONCE(status == 0)) + return -EIO; + return status; + } + + cond_resched(); + + pos += status; + written += status; + length -= status; + + balance_dirty_pages_ratelimited(inode->i_mapping); + } while (length); + + return written; +} + +int +iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, + struct iomap_ops *ops) +{ + loff_t ret; + + while (len) { + ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL, + iomap_dirty_actor); + if (ret <= 0) + return ret; + pos += ret; + len -= ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(iomap_file_dirty); + static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, unsigned bytes, struct iomap *iomap) { @@ -430,6 +509,8 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, if (iomap->flags & IOMAP_F_MERGED) flags |= FIEMAP_EXTENT_MERGED; + if (iomap->flags & IOMAP_F_SHARED) + flags |= FIEMAP_EXTENT_SHARED; return fiemap_fill_next_extent(fi, iomap->offset, iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0, diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index fc593c8..584e87e 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -52,6 +52,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_inode_fork.o \ xfs_inode_buf.o \ xfs_log_rlimit.o \ + xfs_ag_resv.o \ xfs_rmap.o \ xfs_rmap_btree.o \ xfs_sb.o \ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c new file mode 100644 index 0000000..e3ae0f2 --- /dev/null +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -0,0 +1,325 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_ag_resv.h" +#include "xfs_trans_space.h" +#include "xfs_rmap_btree.h" +#include "xfs_btree.h" + +/* + * Per-AG Block Reservations + * + * For some kinds of allocation group metadata structures, it is advantageous + * to reserve a small number of blocks in each AG so that future expansions of + * that data structure do not encounter ENOSPC because errors during a btree + * split cause the filesystem to go offline. + * + * Prior to the introduction of reflink, this wasn't an issue because the free + * space btrees maintain a reserve of space (the AGFL) to handle any expansion + * that may be necessary; and allocations of other metadata (inodes, BMBT, + * dir/attr) aren't restricted to a single AG. However, with reflink it is + * possible to allocate all the space in an AG, have subsequent reflink/CoW + * activity expand the refcount btree, and discover that there's no space left + * to handle that expansion. Since we can calculate the maximum size of the + * refcount btree, we can reserve space for it and avoid ENOSPC. + * + * Handling per-AG reservations consists of three changes to the allocator's + * behavior: First, because these reservations are always needed, we decrease + * the ag_max_usable counter to reflect the size of the AG after the reserved + * blocks are taken. Second, the reservations must be reflected in the + * fdblocks count to maintain proper accounting. Third, each AG must maintain + * its own reserved block counter so that we can calculate the amount of space + * that must remain free to maintain the reservations. Fourth, the "remaining + * reserved blocks" count must be used when calculating the length of the + * longest free extent in an AG and to clamp maxlen in the per-AG allocation + * functions. In other words, we maintain a virtual allocation via in-core + * accounting tricks so that we don't have to clean up after a crash. :) + * + * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type + * values via struct xfs_alloc_arg or directly to the xfs_free_extent + * function. It might seem a little funny to maintain a reservoir of blocks + * to feed another reservoir, but the AGFL only holds enough blocks to get + * through the next transaction. The per-AG reservation is to ensure (we + * hope) that each AG never runs out of blocks. Each data structure wanting + * to use the reservation system should update ask/used in xfs_ag_resv_init. + */ + +/* + * Are we critically low on blocks? For now we'll define that as the number + * of blocks we can get our hands on being less than 10% of what we reserved + * or less than some arbitrary number (maximum btree height). 
+ */ +bool +xfs_ag_resv_critical( + struct xfs_perag *pag, + enum xfs_ag_resv_type type) +{ + xfs_extlen_t avail; + xfs_extlen_t orig; + + switch (type) { + case XFS_AG_RESV_METADATA: + avail = pag->pagf_freeblks - pag->pag_agfl_resv.ar_reserved; + orig = pag->pag_meta_resv.ar_asked; + break; + case XFS_AG_RESV_AGFL: + avail = pag->pagf_freeblks + pag->pagf_flcount - + pag->pag_meta_resv.ar_reserved; + orig = pag->pag_agfl_resv.ar_asked; + break; + default: + ASSERT(0); + return false; + } + + trace_xfs_ag_resv_critical(pag, type, avail); + + /* Critically low if less than 10% or max btree height remains. */ + return avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS; +} + +/* + * How many blocks are reserved but not used, and therefore must not be + * allocated away? + */ +xfs_extlen_t +xfs_ag_resv_needed( + struct xfs_perag *pag, + enum xfs_ag_resv_type type) +{ + xfs_extlen_t len; + + len = pag->pag_meta_resv.ar_reserved + pag->pag_agfl_resv.ar_reserved; + switch (type) { + case XFS_AG_RESV_METADATA: + case XFS_AG_RESV_AGFL: + len -= xfs_perag_resv(pag, type)->ar_reserved; + break; + case XFS_AG_RESV_NONE: + /* empty */ + break; + default: + ASSERT(0); + } + + trace_xfs_ag_resv_needed(pag, type, len); + + return len; +} + +/* Clean out a reservation */ +static int +__xfs_ag_resv_free( + struct xfs_perag *pag, + enum xfs_ag_resv_type type) +{ + struct xfs_ag_resv *resv; + xfs_extlen_t oldresv; + int error; + + trace_xfs_ag_resv_free(pag, type, 0); + + resv = xfs_perag_resv(pag, type); + pag->pag_mount->m_ag_max_usable += resv->ar_asked; + /* + * AGFL blocks are always considered "free", so whatever + * was reserved at mount time must be given back at umount. + */ + if (type == XFS_AG_RESV_AGFL) + oldresv = resv->ar_orig_reserved; + else + oldresv = resv->ar_reserved; + error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true); + resv->ar_reserved = 0; + resv->ar_asked = 0; + + if (error) + trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno, + error, _RET_IP_); + return error; +} + +/* Free a per-AG reservation. */ +int +xfs_ag_resv_free( + struct xfs_perag *pag) +{ + int error; + int err2; + + error = __xfs_ag_resv_free(pag, XFS_AG_RESV_AGFL); + err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA); + if (err2 && !error) + error = err2; + return error; +} + +static int +__xfs_ag_resv_init( + struct xfs_perag *pag, + enum xfs_ag_resv_type type, + xfs_extlen_t ask, + xfs_extlen_t used) +{ + struct xfs_mount *mp = pag->pag_mount; + struct xfs_ag_resv *resv; + int error; + + resv = xfs_perag_resv(pag, type); + if (used > ask) + ask = used; + resv->ar_asked = ask; + resv->ar_reserved = resv->ar_orig_reserved = ask - used; + mp->m_ag_max_usable -= ask; + + trace_xfs_ag_resv_init(pag, type, ask); + + error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true); + if (error) + trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, + error, _RET_IP_); + + return error; +} + +/* Create a per-AG block reservation. */ +int +xfs_ag_resv_init( + struct xfs_perag *pag) +{ + xfs_extlen_t ask; + xfs_extlen_t used; + int error = 0; + + /* Create the metadata reservation. 
*/ + if (pag->pag_meta_resv.ar_asked == 0) { + ask = used = 0; + + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, + ask, used); + if (error) + goto out; + } + + /* Create the AGFL metadata reservation */ + if (pag->pag_agfl_resv.ar_asked == 0) { + ask = used = 0; + + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used); + if (error) + goto out; + } + +out: + return error; +} + +/* Allocate a block from the reservation. */ +void +xfs_ag_resv_alloc_extent( + struct xfs_perag *pag, + enum xfs_ag_resv_type type, + struct xfs_alloc_arg *args) +{ + struct xfs_ag_resv *resv; + xfs_extlen_t len; + uint field; + + trace_xfs_ag_resv_alloc_extent(pag, type, args->len); + + switch (type) { + case XFS_AG_RESV_METADATA: + case XFS_AG_RESV_AGFL: + resv = xfs_perag_resv(pag, type); + break; + default: + ASSERT(0); + /* fall through */ + case XFS_AG_RESV_NONE: + field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS : + XFS_TRANS_SB_FDBLOCKS; + xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len); + return; + } + + len = min_t(xfs_extlen_t, args->len, resv->ar_reserved); + resv->ar_reserved -= len; + if (type == XFS_AG_RESV_AGFL) + return; + /* Allocations of reserved blocks only need on-disk sb updates... */ + xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len); + /* ...but non-reserved blocks need in-core and on-disk updates. */ + if (args->len > len) + xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS, + -((int64_t)args->len - len)); +} + +/* Free a block to the reservation. */ +void +xfs_ag_resv_free_extent( + struct xfs_perag *pag, + enum xfs_ag_resv_type type, + struct xfs_trans *tp, + xfs_extlen_t len) +{ + xfs_extlen_t leftover; + struct xfs_ag_resv *resv; + + trace_xfs_ag_resv_free_extent(pag, type, len); + + switch (type) { + case XFS_AG_RESV_METADATA: + case XFS_AG_RESV_AGFL: + resv = xfs_perag_resv(pag, type); + break; + default: + ASSERT(0); + /* fall through */ + case XFS_AG_RESV_NONE: + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len); + return; + } + + leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved); + resv->ar_reserved += leftover; + if (type == XFS_AG_RESV_AGFL) + return; + /* Freeing into the reserved pool only requires on-disk update... */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len); + /* ...but freeing beyond that requires in-core and on-disk update. */ + if (len > leftover) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover); +} diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h new file mode 100644 index 0000000..8d6c687 --- /dev/null +++ b/fs/xfs/libxfs/xfs_ag_resv.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ +#ifndef __XFS_AG_RESV_H__ +#define __XFS_AG_RESV_H__ + +int xfs_ag_resv_free(struct xfs_perag *pag); +int xfs_ag_resv_init(struct xfs_perag *pag); + +bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type); +xfs_extlen_t xfs_ag_resv_needed(struct xfs_perag *pag, + enum xfs_ag_resv_type type); + +void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type, + struct xfs_alloc_arg *args); +void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type, + struct xfs_trans *tp, xfs_extlen_t len); + +#endif /* __XFS_AG_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 05b5243..ca75dc9 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -37,6 +37,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_ag_resv.h" struct workqueue_struct *xfs_alloc_wq; @@ -74,14 +75,8 @@ xfs_prealloc_blocks( * extents need to be actually allocated. To get around this, we explicitly set * aside a few blocks which will not be reserved in delayed allocation. * - * When rmap is disabled, we need to reserve 4 fsbs _per AG_ for the freelist - * and 4 more to handle a potential split of the file's bmap btree. - * - * When rmap is enabled, we must also be able to handle two rmap btree inserts - * to record both the file data extent and a new bmbt block. The bmbt block - * might not be in the same AG as the file data extent. In the worst case - * the bmap btree splits multiple levels and all the new blocks come from - * different AGs, so set aside enough to handle rmap btree splits in all AGs. + * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a + * potential split of the file's bmap btree. */ unsigned int xfs_alloc_set_aside( @@ -90,8 +85,6 @@ xfs_alloc_set_aside( unsigned int blocks; blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE); - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - blocks += mp->m_sb.sb_agcount * mp->m_rmap_maxlevels; return blocks; } @@ -265,7 +258,7 @@ xfs_alloc_compute_diff( xfs_agblock_t wantbno, /* target starting block */ xfs_extlen_t wantlen, /* target length */ xfs_extlen_t alignment, /* target alignment */ - char userdata, /* are we allocating data? */ + int datatype, /* are we allocating data? */ xfs_agblock_t freebno, /* freespace's starting block */ xfs_extlen_t freelen, /* freespace's length */ xfs_agblock_t *newbnop) /* result: best start block from free */ @@ -276,6 +269,7 @@ xfs_alloc_compute_diff( xfs_extlen_t newlen1=0; /* length with newbno1 */ xfs_extlen_t newlen2=0; /* length with newbno2 */ xfs_agblock_t wantend; /* end of target extent */ + bool userdata = xfs_alloc_is_userdata(datatype); ASSERT(freelen >= wantlen); freeend = freebno + freelen; @@ -680,12 +674,29 @@ xfs_alloc_ag_vextent( xfs_alloc_arg_t *args) /* argument structure for allocation */ { int error=0; + xfs_extlen_t reservation; + xfs_extlen_t oldmax; ASSERT(args->minlen > 0); ASSERT(args->maxlen > 0); ASSERT(args->minlen <= args->maxlen); ASSERT(args->mod < args->prod); ASSERT(args->alignment > 0); + + /* + * Clamp maxlen to the amount of free space minus any reservations + * that have been made. 
+ */ + oldmax = args->maxlen; + reservation = xfs_ag_resv_needed(args->pag, args->resv); + if (args->maxlen > args->pag->pagf_freeblks - reservation) + args->maxlen = args->pag->pagf_freeblks - reservation; + if (args->maxlen == 0) { + args->agbno = NULLAGBLOCK; + args->maxlen = oldmax; + return 0; + } + /* * Branch to correct routine based on the type. */ @@ -705,12 +716,14 @@ xfs_alloc_ag_vextent( /* NOTREACHED */ } + args->maxlen = oldmax; + if (error || args->agbno == NULLAGBLOCK) return error; ASSERT(args->len >= args->minlen); ASSERT(args->len <= args->maxlen); - ASSERT(!args->wasfromfl || !args->isfl); + ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL); ASSERT(args->agbno % args->alignment == 0); /* if not file data, insert new block into the reverse map btree */ @@ -732,12 +745,7 @@ xfs_alloc_ag_vextent( args->agbno, args->len)); } - if (!args->isfl) { - xfs_trans_mod_sb(args->tp, args->wasdel ? - XFS_TRANS_SB_RES_FDBLOCKS : - XFS_TRANS_SB_FDBLOCKS, - -((long)(args->len))); - } + xfs_ag_resv_alloc_extent(args->pag, args->resv, args); XFS_STATS_INC(args->mp, xs_allocx); XFS_STATS_ADD(args->mp, xs_allocb, args->len); @@ -917,7 +925,7 @@ xfs_alloc_find_best_extent( sdiff = xfs_alloc_compute_diff(args->agbno, args->len, args->alignment, - args->userdata, *sbnoa, + args->datatype, *sbnoa, *slena, &new); /* @@ -1101,7 +1109,7 @@ restart: if (args->len < blen) continue; ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, args->userdata, ltbnoa, + args->alignment, args->datatype, ltbnoa, ltlena, <new); if (ltnew != NULLAGBLOCK && (args->len > blen || ltdiff < bdiff)) { @@ -1254,7 +1262,7 @@ restart: args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, args->userdata, ltbnoa, + args->alignment, args->datatype, ltbnoa, ltlena, <new); error = xfs_alloc_find_best_extent(args, @@ -1271,7 +1279,7 @@ restart: args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); xfs_alloc_fix_len(args); gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, args->userdata, gtbnoa, + args->alignment, args->datatype, gtbnoa, gtlena, >new); error = xfs_alloc_find_best_extent(args, @@ -1331,7 +1339,7 @@ restart: } rlen = args->len; (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, - args->userdata, ltbnoa, ltlena, <new); + args->datatype, ltbnoa, ltlena, <new); ASSERT(ltnew >= ltbno); ASSERT(ltnew + rlen <= ltbnoa + ltlena); ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); @@ -1583,6 +1591,7 @@ xfs_alloc_ag_vextent_small( int *stat) /* status: 0-freelist, 1-normal/none */ { struct xfs_owner_info oinfo; + struct xfs_perag *pag; int error; xfs_agblock_t fbno; xfs_extlen_t flen; @@ -1600,7 +1609,8 @@ xfs_alloc_ag_vextent_small( * to respect minleft even when pulling from the * freelist. 
*/ - else if (args->minlen == 1 && args->alignment == 1 && !args->isfl && + else if (args->minlen == 1 && args->alignment == 1 && + args->resv != XFS_AG_RESV_AGFL && (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) > args->minleft)) { error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); @@ -1608,9 +1618,9 @@ xfs_alloc_ag_vextent_small( goto error0; if (fbno != NULLAGBLOCK) { xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, - args->userdata); + xfs_alloc_allow_busy_reuse(args->datatype)); - if (args->userdata) { + if (xfs_alloc_is_userdata(args->datatype)) { xfs_buf_t *bp; bp = xfs_btree_get_bufs(args->mp, args->tp, @@ -1629,13 +1639,18 @@ xfs_alloc_ag_vextent_small( /* * If we're feeding an AGFL block to something that * doesn't live in the free space, we need to clear - * out the OWN_AG rmap. + * out the OWN_AG rmap and add the block back to + * the AGFL per-AG reservation. */ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1, &oinfo); if (error) goto error0; + pag = xfs_perag_get(args->mp, args->agno); + xfs_ag_resv_free_extent(pag, XFS_AG_RESV_AGFL, + args->tp, 1); + xfs_perag_put(pag); *stat = 0; return 0; @@ -1683,7 +1698,7 @@ xfs_free_ag_extent( xfs_agblock_t bno, xfs_extlen_t len, struct xfs_owner_info *oinfo, - int isfl) + enum xfs_ag_resv_type type) { xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */ xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */ @@ -1911,21 +1926,22 @@ xfs_free_ag_extent( */ pag = xfs_perag_get(mp, agno); error = xfs_alloc_update_counters(tp, pag, agbp, len); + xfs_ag_resv_free_extent(pag, type, tp, len); xfs_perag_put(pag); if (error) goto error0; - if (!isfl) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len); XFS_STATS_INC(mp, xs_freex); XFS_STATS_ADD(mp, xs_freeb, len); - trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); + trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL, + haveleft, haveright); return 0; error0: - trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1); + trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL, + -1, -1); if (bno_cur) xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); if (cnt_cur) @@ -1950,21 +1966,43 @@ xfs_alloc_compute_maxlevels( } /* - * Find the length of the longest extent in an AG. + * Find the length of the longest extent in an AG. The 'need' parameter + * specifies how much space we're going to need for the AGFL and the + * 'reserved' parameter tells us how many blocks in this AG are reserved for + * other callers. */ xfs_extlen_t xfs_alloc_longest_free_extent( struct xfs_mount *mp, struct xfs_perag *pag, - xfs_extlen_t need) + xfs_extlen_t need, + xfs_extlen_t reserved) { xfs_extlen_t delta = 0; + /* + * If the AGFL needs a recharge, we'll have to subtract that from the + * longest extent. + */ if (need > pag->pagf_flcount) delta = need - pag->pagf_flcount; + /* + * If we cannot maintain others' reservations with space from the + * not-longest freesp extents, we'll have to subtract /that/ from + * the longest extent too. + */ + if (pag->pagf_freeblks - pag->pagf_longest < reserved) + delta += reserved - (pag->pagf_freeblks - pag->pagf_longest); + + /* + * If the longest extent is long enough to satisfy all the + * reservations and AGFL rules in place, we can return this extent. + */ if (pag->pagf_longest > delta) return pag->pagf_longest - delta; + + /* Otherwise, let the caller try for 1 block if there's space. 
*/ return pag->pagf_flcount > 0 || pag->pagf_longest > 0; } @@ -2004,20 +2042,24 @@ xfs_alloc_space_available( { struct xfs_perag *pag = args->pag; xfs_extlen_t longest; + xfs_extlen_t reservation; /* blocks that are still reserved */ int available; if (flags & XFS_ALLOC_FLAG_FREEING) return true; + reservation = xfs_ag_resv_needed(pag, args->resv); + /* do we have enough contiguous free space for the allocation? */ - longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free); + longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free, + reservation); if ((args->minlen + args->alignment + args->minalignslop - 1) > longest) return false; - /* do have enough free space remaining for the allocation? */ + /* do we have enough free space remaining for the allocation? */ available = (int)(pag->pagf_freeblks + pag->pagf_flcount - - min_free - args->total); - if (available < (int)args->minleft) + reservation - min_free - args->total); + if (available < (int)args->minleft || available <= 0) return false; return true; @@ -2058,7 +2100,7 @@ xfs_alloc_fix_freelist( * somewhere else if we are not being asked to try harder at this * point */ - if (pag->pagf_metadata && args->userdata && + if (pag->pagf_metadata && xfs_alloc_is_userdata(args->datatype) && (flags & XFS_ALLOC_FLAG_TRYLOCK)) { ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); goto out_agbp_relse; @@ -2124,7 +2166,7 @@ xfs_alloc_fix_freelist( if (error) goto out_agbp_relse; error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, - &targs.oinfo, 1); + &targs.oinfo, XFS_AG_RESV_AGFL); if (error) goto out_agbp_relse; bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); @@ -2135,7 +2177,7 @@ xfs_alloc_fix_freelist( targs.mp = mp; targs.agbp = agbp; targs.agno = args->agno; - targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; + targs.alignment = targs.minlen = targs.prod = 1; targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp); @@ -2146,6 +2188,7 @@ xfs_alloc_fix_freelist( while (pag->pagf_flcount < need) { targs.agbno = 0; targs.maxlen = need - pag->pagf_flcount; + targs.resv = XFS_AG_RESV_AGFL; /* Allocate as many blocks as possible at once. */ error = xfs_alloc_ag_vextent(&targs); @@ -2633,7 +2676,7 @@ xfs_alloc_vextent( * Try near allocation first, then anywhere-in-ag after * the first a.g. fails. 
*/ - if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) && + if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) && (mp->m_flags & XFS_MOUNT_32BITINODES)) { args->fsbno = XFS_AGB_TO_FSB(mp, ((mp->m_agfrotor / rotorstep) % @@ -2766,7 +2809,7 @@ xfs_alloc_vextent( #endif /* Zero the extent if we were asked to do so */ - if (args->userdata & XFS_ALLOC_USERDATA_ZERO) { + if (args->datatype & XFS_ALLOC_USERDATA_ZERO) { error = xfs_zero_extent(args->ip, args->fsbno, args->len); if (error) goto error0; @@ -2825,7 +2868,8 @@ xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len, /* length of extent */ - struct xfs_owner_info *oinfo) /* extent owner */ + struct xfs_owner_info *oinfo, /* extent owner */ + enum xfs_ag_resv_type type) /* block reservation type */ { struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *agbp; @@ -2834,6 +2878,7 @@ xfs_free_extent( int error; ASSERT(len != 0); + ASSERT(type != XFS_AG_RESV_AGFL); if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT, @@ -2851,7 +2896,7 @@ xfs_free_extent( agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length), err); - error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, 0); + error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type); if (error) goto err; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 6fe2d6b..7c404a6 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -85,20 +85,33 @@ typedef struct xfs_alloc_arg { xfs_extlen_t len; /* output: actual size of extent */ xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */ xfs_alloctype_t otype; /* original allocation type */ + int datatype; /* mask defining data type treatment */ char wasdel; /* set if allocation was prev delayed */ char wasfromfl; /* set if allocation is from freelist */ - char isfl; /* set if is freelist blocks - !acctg */ - char userdata; /* mask defining userdata treatment */ xfs_fsblock_t firstblock; /* io first block allocated */ struct xfs_owner_info oinfo; /* owner of blocks being allocated */ + enum xfs_ag_resv_type resv; /* block reservation to use */ } xfs_alloc_arg_t; /* - * Defines for userdata + * Defines for datatype */ #define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/ #define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ #define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ +#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */ + +static inline bool +xfs_alloc_is_userdata(int datatype) +{ + return (datatype & ~XFS_ALLOC_NOBUSY) != 0; +} + +static inline bool +xfs_alloc_allow_busy_reuse(int datatype) +{ + return (datatype & XFS_ALLOC_NOBUSY) == 0; +} /* freespace limit calculations */ #define XFS_ALLOC_AGFL_RESERVE 4 @@ -106,7 +119,8 @@ unsigned int xfs_alloc_set_aside(struct xfs_mount *mp); unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp); xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, - struct xfs_perag *pag, xfs_extlen_t need); + struct xfs_perag *pag, xfs_extlen_t need, + xfs_extlen_t reserved); unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, struct xfs_perag *pag); @@ -184,7 +198,8 @@ xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len, /* length of extent */ - struct xfs_owner_info *oinfo);/* extent owner */ + struct xfs_owner_info *oinfo, /* extent owner */ + enum xfs_ag_resv_type type); /* block 
reservation type */ int /* error */ xfs_alloc_lookup_ge( diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index b060bca..9d7f61d 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -47,6 +47,7 @@ #include "xfs_attr_leaf.h" #include "xfs_filestream.h" #include "xfs_rmap.h" +#include "xfs_ag_resv.h" kmem_zone_t *xfs_bmap_free_item_zone; @@ -1388,7 +1389,7 @@ xfs_bmap_search_multi_extents( * Else, *lastxp will be set to the index of the found * entry; *gotp will contain the entry. */ -STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +xfs_bmbt_rec_host_t * /* pointer to found extent entry */ xfs_bmap_search_extents( xfs_inode_t *ip, /* incore inode pointer */ xfs_fileoff_t bno, /* block number searched for */ @@ -3347,7 +3348,8 @@ xfs_bmap_adjacent( mp = ap->ip->i_mount; nullfb = *ap->firstblock == NULLFSBLOCK; - rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; + rt = XFS_IS_REALTIME_INODE(ap->ip) && + xfs_alloc_is_userdata(ap->datatype); fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); /* * If allocating at eof, and there's a previous real block, @@ -3501,7 +3503,8 @@ xfs_bmap_longest_free_extent( } longest = xfs_alloc_longest_free_extent(mp, pag, - xfs_alloc_min_freelist(mp, pag)); + xfs_alloc_min_freelist(mp, pag), + xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (*blen < longest) *blen = longest; @@ -3622,7 +3625,7 @@ xfs_bmap_btalloc( { xfs_mount_t *mp; /* mount point structure */ xfs_alloctype_t atype = 0; /* type for allocation routines */ - xfs_extlen_t align; /* minimum allocation alignment */ + xfs_extlen_t align = 0; /* minimum allocation alignment */ xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ xfs_agnumber_t ag; xfs_alloc_arg_t args; @@ -3645,7 +3648,8 @@ xfs_bmap_btalloc( else if (mp->m_dalign) stripe_align = mp->m_dalign; - align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; + if (xfs_alloc_is_userdata(ap->datatype)) + align = xfs_get_extsz_hint(ap->ip); if (unlikely(align)) { error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, @@ -3658,7 +3662,8 @@ xfs_bmap_btalloc( nullfb = *ap->firstblock == NULLFSBLOCK; fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); if (nullfb) { - if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { + if (xfs_alloc_is_userdata(ap->datatype) && + xfs_inode_is_filestream(ap->ip)) { ag = xfs_filestream_lookup_ag(ap->ip); ag = (ag != NULLAGNUMBER) ? ag : 0; ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0); @@ -3698,7 +3703,8 @@ xfs_bmap_btalloc( * enough for the request. If one isn't found, then adjust * the minimum allocation size to the largest space found. 
*/ - if (ap->userdata && xfs_inode_is_filestream(ap->ip)) + if (xfs_alloc_is_userdata(ap->datatype) && + xfs_inode_is_filestream(ap->ip)) error = xfs_bmap_btalloc_filestreams(ap, &args, &blen); else error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); @@ -3781,9 +3787,9 @@ xfs_bmap_btalloc( } args.minleft = ap->minleft; args.wasdel = ap->wasdel; - args.isfl = 0; - args.userdata = ap->userdata; - if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) + args.resv = XFS_AG_RESV_NONE; + args.datatype = ap->datatype; + if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) args.ip = ap->ip; error = xfs_alloc_vextent(&args); @@ -3877,7 +3883,8 @@ STATIC int xfs_bmap_alloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { - if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata) + if (XFS_IS_REALTIME_INODE(ap->ip) && + xfs_alloc_is_userdata(ap->datatype)) return xfs_bmap_rtalloc(ap); return xfs_bmap_btalloc(ap); } @@ -4074,7 +4081,7 @@ xfs_bmapi_read( return 0; } -STATIC int +int xfs_bmapi_reserve_delalloc( struct xfs_inode *ip, xfs_fileoff_t aoff, @@ -4170,91 +4177,6 @@ out_unreserve_quota: return error; } -/* - * Map file blocks to filesystem blocks, adding delayed allocations as needed. - */ -int -xfs_bmapi_delay( - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting file offs. mapped */ - xfs_filblks_t len, /* length to map in file */ - struct xfs_bmbt_irec *mval, /* output: map values */ - int *nmap, /* i/o: mval size/count */ - int flags) /* XFS_BMAPI_... */ -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - struct xfs_bmbt_irec got; /* current file extent record */ - struct xfs_bmbt_irec prev; /* previous file extent record */ - xfs_fileoff_t obno; /* old block number (offset) */ - xfs_fileoff_t end; /* end of mapped file region */ - xfs_extnum_t lastx; /* last useful extent number */ - int eof; /* we've hit the end of extents */ - int n = 0; /* current extent index */ - int error = 0; - - ASSERT(*nmap >= 1); - ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); - ASSERT(!(flags & ~XFS_BMAPI_ENTIRE)); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - if (unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), - mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { - XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp); - return -EFSCORRUPTED; - } - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - XFS_STATS_INC(mp, xs_blk_mapw); - - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); - if (error) - return error; - } - - xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev); - end = bno + len; - obno = bno; - - while (bno < end && n < *nmap) { - if (eof || got.br_startoff > bno) { - error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got, - &prev, &lastx, eof); - if (error) { - if (n == 0) { - *nmap = 0; - return error; - } - break; - } - } - - /* set up the extent map to return. */ - xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); - xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); - - /* If we're done, stop now. */ - if (bno >= end || n >= *nmap) - break; - - /* Else go on to the next record. 
*/ - prev = got; - if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); - else - eof = 1; - } - - *nmap = n; - return 0; -} - - static int xfs_bmapi_allocate( struct xfs_bmalloca *bma) @@ -4287,15 +4209,21 @@ xfs_bmapi_allocate( } /* - * Indicate if this is the first user data in the file, or just any - * user data. And if it is userdata, indicate whether it needs to - * be initialised to zero during allocation. + * Set the data type being allocated. For the data fork, the first data + * in the file is treated differently to all other allocations. For the + * attribute fork, we only need to ensure the allocated range is not on + * the busy list. */ if (!(bma->flags & XFS_BMAPI_METADATA)) { - bma->userdata = (bma->offset == 0) ? - XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; + bma->datatype = XFS_ALLOC_NOBUSY; + if (whichfork == XFS_DATA_FORK) { + if (bma->offset == 0) + bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; + else + bma->datatype |= XFS_ALLOC_USERDATA; + } if (bma->flags & XFS_BMAPI_ZERO) - bma->userdata |= XFS_ALLOC_USERDATA_ZERO; + bma->datatype |= XFS_ALLOC_USERDATA_ZERO; } bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; @@ -4565,7 +4493,7 @@ xfs_bmapi_write( bma.tp = tp; bma.ip = ip; bma.total = total; - bma.userdata = 0; + bma.datatype = 0; bma.dfops = dfops; bma.firstblock = firstblock; diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 254034f..8395f6e 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -54,7 +54,7 @@ struct xfs_bmalloca { bool wasdel; /* replacing a delayed allocation */ bool aeof; /* allocated space at eof */ bool conv; /* overwriting unwritten extents */ - char userdata;/* userdata mask */ + int datatype;/* data type being allocated */ int flags; }; @@ -181,9 +181,6 @@ int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip, int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, int flags); -int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno, - xfs_filblks_t len, struct xfs_bmbt_irec *mval, - int *nmap, int flags); int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, int flags, xfs_fsblock_t *firstblock, xfs_extlen_t total, @@ -202,5 +199,12 @@ int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_defer_ops *dfops, enum shift_direction direction, int num_exts); int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); +struct xfs_bmbt_rec_host * + xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno, + int fork, int *eofp, xfs_extnum_t *lastxp, + struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp); +int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, xfs_fileoff_t aoff, + xfs_filblks_t len, struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *prev, xfs_extnum_t *lastx, int eof); #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 0856979..aa1752f 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2070,7 +2070,7 @@ __xfs_btree_updkeys( struct xfs_buf *bp0, bool force_all) { - union xfs_btree_bigkey key; /* keys from current level */ + union xfs_btree_key key; /* keys from current level */ union xfs_btree_key *lkey; /* keys from the next level up */ union xfs_btree_key *hkey; union xfs_btree_key *nlkey; /* keys from the next level up */ @@ -2086,7 +2086,7 @@ 
__xfs_btree_updkeys( trace_xfs_btree_updkeys(cur, level, bp0); - lkey = (union xfs_btree_key *)&key; + lkey = &key; hkey = xfs_btree_high_key_from_key(cur, lkey); xfs_btree_get_keys(cur, block, lkey); for (level++; level < cur->bc_nlevels; level++) { @@ -3226,7 +3226,7 @@ xfs_btree_insrec( struct xfs_buf *bp; /* buffer for block */ union xfs_btree_ptr nptr; /* new block ptr */ struct xfs_btree_cur *ncur; /* new btree cursor */ - union xfs_btree_bigkey nkey; /* new block key */ + union xfs_btree_key nkey; /* new block key */ union xfs_btree_key *lkey; int optr; /* old key/record index */ int ptr; /* key/record index */ @@ -3241,7 +3241,7 @@ xfs_btree_insrec( XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec); ncur = NULL; - lkey = (union xfs_btree_key *)&nkey; + lkey = &nkey; /* * If we have an external root pointer, and we've made it to the @@ -3444,14 +3444,14 @@ xfs_btree_insert( union xfs_btree_ptr nptr; /* new block number (split result) */ struct xfs_btree_cur *ncur; /* new cursor (split result) */ struct xfs_btree_cur *pcur; /* previous level's cursor */ - union xfs_btree_bigkey bkey; /* key of block to insert */ + union xfs_btree_key bkey; /* key of block to insert */ union xfs_btree_key *key; union xfs_btree_rec rec; /* record to insert */ level = 0; ncur = NULL; pcur = cur; - key = (union xfs_btree_key *)&bkey; + key = &bkey; xfs_btree_set_ptr_null(cur, &nptr); @@ -4797,3 +4797,50 @@ xfs_btree_query_range( return xfs_btree_overlapped_query_range(cur, &low_key, &high_key, fn, priv); } + +/* + * Calculate the number of blocks needed to store a given number of records + * in a short-format (per-AG metadata) btree. + */ +xfs_extlen_t +xfs_btree_calc_size( + struct xfs_mount *mp, + uint *limits, + unsigned long long len) +{ + int level; + int maxrecs; + xfs_extlen_t rval; + + maxrecs = limits[0]; + for (level = 0, rval = 0; len > 1; level++) { + len += maxrecs - 1; + do_div(len, maxrecs); + maxrecs = limits[1]; + rval += len; + } + return rval; +} + +int +xfs_btree_count_blocks_helper( + struct xfs_btree_cur *cur, + int level, + void *data) +{ + xfs_extlen_t *blocks = data; + (*blocks)++; + + return 0; +} + +/* Count the blocks in a btree and return the result in *blocks. */ +int +xfs_btree_count_blocks( + struct xfs_btree_cur *cur, + xfs_extlen_t *blocks) +{ + *blocks = 0; + return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, + blocks); +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 04d0865..3f8556a 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -37,30 +37,18 @@ union xfs_btree_ptr { __be64 l; /* long form ptr */ }; -union xfs_btree_key { - struct xfs_bmbt_key bmbt; - xfs_bmdr_key_t bmbr; /* bmbt root block */ - xfs_alloc_key_t alloc; - struct xfs_inobt_key inobt; - struct xfs_rmap_key rmap; -}; - /* - * In-core key that holds both low and high keys for overlapped btrees. - * The two keys are packed next to each other on disk, so do the same - * in memory. Preserve the existing xfs_btree_key as a single key to - * avoid the mental model breakage that would happen if we passed a - * bigkey into a function that operates on a single key. + * The in-core btree key. Overlapping btrees actually store two keys + * per pointer, so we reserve enough memory to hold both. The __*bigkey + * items should never be accessed directly. 
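The layout guarantee that comment relies on can be demonstrated with a small stand-alone program (simplified stand-in types, not part of the patch): the union reserves room for two packed keys, so overlapped-btree code can treat the high key as living immediately after the low key within a single union xfs_btree_key.

/* Stand-alone illustration of the two-keys-in-one-union layout (not kernel code). */
#include <assert.h>
#include <stdio.h>
#include <stdint.h>

struct rmap_key {			/* simplified stand-in for struct xfs_rmap_key */
	uint32_t	rm_startblock;
	uint64_t	rm_owner;
	uint64_t	rm_offset;
} __attribute__((packed));

union btree_key {			/* simplified stand-in for union xfs_btree_key */
	struct rmap_key	rmap;
	struct rmap_key	__rmap_bigkey[2];	/* reserves space, never used directly */
};

int main(void)
{
	union btree_key key;

	/* Big enough to hold a low key and a high key back to back... */
	assert(sizeof(key) >= 2 * sizeof(struct rmap_key));

	/* ...with the high key starting immediately after the low key. */
	assert((char *)&key.__rmap_bigkey[1] ==
	       (char *)&key + sizeof(struct rmap_key));

	printf("key union: %zu bytes, single rmap key: %zu bytes\n",
	       sizeof(key), sizeof(struct rmap_key));
	return 0;
}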
*/ -union xfs_btree_bigkey { +union xfs_btree_key { struct xfs_bmbt_key bmbt; xfs_bmdr_key_t bmbr; /* bmbt root block */ xfs_alloc_key_t alloc; struct xfs_inobt_key inobt; - struct { - struct xfs_rmap_key rmap; - struct xfs_rmap_key rmap_hi; - }; + struct xfs_rmap_key rmap; + struct xfs_rmap_key __rmap_bigkey[2]; }; union xfs_btree_rec { @@ -513,6 +501,8 @@ bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits, unsigned long len); +xfs_extlen_t xfs_btree_calc_size(struct xfs_mount *mp, uint *limits, + unsigned long long len); /* return codes */ #define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */ @@ -529,4 +519,6 @@ typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, xfs_btree_visit_blocks_fn fn, void *data); +int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index c221d0e..613c5cf1 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -81,6 +81,10 @@ * - For each work item attached to the log intent item, * * Perform the described action. * * Attach the work item to the log done item. + * * If the result of doing the work was -EAGAIN, ->finish work + * wants a new transaction. See the "Requesting a Fresh + * Transaction while Finishing Deferred Work" section below for + * details. * * The key here is that we must log an intent item for all pending * work items every time we roll the transaction, and that we must log @@ -88,6 +92,34 @@ * we can perform complex remapping operations, chaining intent items * as needed. * + * Requesting a Fresh Transaction while Finishing Deferred Work + * + * If ->finish_item decides that it needs a fresh transaction to + * finish the work, it must ask its caller (xfs_defer_finish) for a + * continuation. The most likely cause of this circumstance are the + * refcount adjust functions deciding that they've logged enough items + * to be at risk of exceeding the transaction reservation. + * + * To get a fresh transaction, we want to log the existing log done + * item to prevent the log intent item from replaying, immediately log + * a new log intent item with the unfinished work items, roll the + * transaction, and re-call ->finish_item wherever it left off. The + * log done item and the new log intent item must be in the same + * transaction or atomicity cannot be guaranteed; defer_finish ensures + * that this happens. + * + * This requires some coordination between ->finish_item and + * defer_finish. Upon deciding to request a new transaction, + * ->finish_item should update the current work item to reflect the + * unfinished work. Next, it should reset the log done item's list + * count to the number of items finished, and return -EAGAIN. + * defer_finish sees the -EAGAIN, logs the new log intent item + * with the remaining work items, and leaves the xfs_defer_pending + * item at the head of the dop_work queue. Then it rolls the + * transaction and picks up processing where it left off. It is + * required that ->finish_item must be careful to leave enough + * transaction reservation to fit the new log intent item. 
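To make that handshake concrete before the worked remapping example that follows, here is a small stand-alone user-space model of it (not part of the patch; the names, the 9-update "reservation" and the extent numbers are illustrative only): finish_item() stops once the pretend reservation is exhausted, records how many steps the done item covers, and returns -EAGAIN; the caller then "rolls the transaction" and calls it again with the remaining work.

/* Stand-alone model of the -EAGAIN continuation handshake (illustrative only). */
#include <errno.h>
#include <stdio.h>

#define STEPS_PER_TRANS	9	/* pretend the reservation covers 9 updates */

struct work {
	int	start;		/* next block needing a refcount update */
	int	count;		/* blocks still needing an update */
};

/* Rough analogue of ->finish_item: do as much as one transaction can hold. */
static int finish_item(struct work *w, int *done_count)
{
	int steps = 0;

	while (w->count > 0) {
		if (steps == STEPS_PER_TRANS) {
			/* Trim the done item to what was finished, ask for more. */
			*done_count = steps;
			return -EAGAIN;
		}
		/* "adjust the refcount" of block w->start */
		w->start++;
		w->count--;
		steps++;
	}
	*done_count = steps;
	return 0;
}

int main(void)
{
	struct work w = { .start = 100, .count = 25 };	/* extent (C, B) = (100, 25) */
	int trans = 2;					/* lines up with "t2" below */
	int done, error;

	do {
		error = finish_item(&w, &done);
		printf("t%d: finished %d updates, %d left%s\n",
		       trans++, done, w.count,
		       error == -EAGAIN ?
				", log new intent and roll transaction" : "");
	} while (error == -EAGAIN);
	return 0;
}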
+ * * This is an example of remapping the extent (E, E+B) into file X at * offset A and dealing with the extent (C, C+B) already being mapped * there: @@ -104,21 +136,26 @@ * | Intent to add rmap (X, E, A, B) | * +-------------------------------------------------+ * | Reduce refcount for extent (C, B) | t2 - * | Done reducing refcount for extent (C, B) | + * | Done reducing refcount for extent (C, 9) | + * | Intent to reduce refcount for extent (C+9, B-9) | + * | (ran out of space after 9 refcount updates) | + * +-------------------------------------------------+ + * | Reduce refcount for extent (C+9, B+9) | t3 + * | Done reducing refcount for extent (C+9, B-9) | * | Increase refcount for extent (E, B) | * | Done increasing refcount for extent (E, B) | * | Intent to free extent (C, B) | * | Intent to free extent (F, 1) (refcountbt block) | * | Intent to remove rmap (F, 1, REFC) | * +-------------------------------------------------+ - * | Remove rmap (X, C, A, B) | t3 + * | Remove rmap (X, C, A, B) | t4 * | Done removing rmap (X, C, A, B) | * | Add rmap (X, E, A, B) | * | Done adding rmap (X, E, A, B) | * | Remove rmap (F, 1, REFC) | * | Done removing rmap (F, 1, REFC) | * +-------------------------------------------------+ - * | Free extent (C, B) | t4 + * | Free extent (C, B) | t5 * | Done freeing extent (C, B) | * | Free extent (D, 1) | * | Done freeing extent (D, 1) | @@ -141,6 +178,9 @@ * - Intent to free extent (C, B) * - Intent to free extent (F, 1) (refcountbt block) * - Intent to remove rmap (F, 1, REFC) + * + * Note that the continuation requested between t2 and t3 is likely to + * reoccur. */ static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX]; @@ -323,7 +363,16 @@ xfs_defer_finish( dfp->dfp_count--; error = dfp->dfp_type->finish_item(*tp, dop, li, dfp->dfp_done, &state); - if (error) { + if (error == -EAGAIN) { + /* + * Caller wants a fresh transaction; + * put the work item back on the list + * and jump out. + */ + list_add(li, &dfp->dfp_work); + dfp->dfp_count++; + break; + } else if (error) { /* * Clean up after ourselves and jump out. * xfs_defer_cancel will take care of freeing @@ -335,9 +384,25 @@ xfs_defer_finish( goto out; } } - /* Done with the dfp, free it. */ - list_del(&dfp->dfp_list); - kmem_free(dfp); + if (error == -EAGAIN) { + /* + * Caller wants a fresh transaction, so log a + * new log intent item to replace the old one + * and roll the transaction. See "Requesting + * a Fresh Transaction while Finishing + * Deferred Work" above. + */ + dfp->dfp_intent = dfp->dfp_type->create_intent(*tp, + dfp->dfp_count); + dfp->dfp_done = NULL; + list_for_each(li, &dfp->dfp_work) + dfp->dfp_type->log_item(*tp, dfp->dfp_intent, + li); + } else { + /* Done with the dfp, free it. 
*/ + list_del(&dfp->dfp_list); + kmem_free(dfp); + } if (cleanup_fn) cleanup_fn(*tp, state, error); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 31ca220..eab68ae 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -132,7 +132,7 @@ xfs_inobt_free_block( xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); return xfs_free_extent(cur->bc_tp, XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1, - &oinfo); + &oinfo, XFS_AG_RESV_NONE); } STATIC int diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index a6eed43..fc5eef8 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -647,9 +647,17 @@ struct xfs_rui_log_format { __uint16_t rui_size; /* size of this item */ __uint32_t rui_nextents; /* # extents to free */ __uint64_t rui_id; /* rui identifier */ - struct xfs_map_extent rui_extents[1]; /* array of extents to rmap */ + struct xfs_map_extent rui_extents[]; /* array of extents to rmap */ }; +static inline size_t +xfs_rui_log_format_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_rui_log_format) + + nr * sizeof(struct xfs_map_extent); +} + /* * This is the structure used to lay out an rud log item in the * log. The rud_extents array is a variable size array whose diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 7575cfc..4a28fa9 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -200,7 +200,7 @@ xfs_setfilesize_trans_alloc( * Update on-disk file size now that data has been written to disk. */ STATIC int -xfs_setfilesize( +__xfs_setfilesize( struct xfs_inode *ip, struct xfs_trans *tp, xfs_off_t offset, @@ -225,6 +225,23 @@ xfs_setfilesize( return xfs_trans_commit(tp); } +int +xfs_setfilesize( + struct xfs_inode *ip, + xfs_off_t offset, + size_t size) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); + if (error) + return error; + + return __xfs_setfilesize(ip, tp, offset, size); +} + STATIC int xfs_setfilesize_ioend( struct xfs_ioend *ioend, @@ -247,7 +264,7 @@ xfs_setfilesize_ioend( return error; } - return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); + return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); } /* @@ -1336,13 +1353,12 @@ xfs_end_io_direct_write( { struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; uintptr_t flags = (uintptr_t)private; int error = 0; trace_xfs_end_io_direct_write(ip, offset, size); - if (XFS_FORCED_SHUTDOWN(mp)) + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; if (size <= 0) @@ -1380,14 +1396,9 @@ xfs_end_io_direct_write( error = xfs_iomap_write_unwritten(ip, offset, size); } else if (flags & XFS_DIO_FLAG_APPEND) { - struct xfs_trans *tp; - trace_xfs_end_io_direct_write_append(ip, offset, size); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, - &tp); - if (!error) - error = xfs_setfilesize(ip, tp, offset, size); + error = xfs_setfilesize(ip, offset, size); } return error; diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index bf2d9a1..1950e3b 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -62,6 +62,7 @@ int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset, ssize_t size, void *private); +int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); extern void xfs_count_page_state(struct page *, int *, int 
*); extern struct block_device *xfs_find_bdev_for_inode(struct inode *); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 4ece4f2..e827d65 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -182,7 +182,7 @@ xfs_bmap_rtalloc( XFS_TRANS_DQ_RTBCOUNT, (long) ralen); /* Zero the extent if we were asked to do so */ - if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) { + if (ap->datatype & XFS_ALLOC_USERDATA_ZERO) { error = xfs_zero_extent(ap->ip, ap->blkno, ap->length); if (error) return error; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index e455f90..2975cb2 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -865,7 +865,7 @@ xfs_buf_item_log_segment( */ if (bit) { end_bit = MIN(bit + bits_to_set, (uint)NBWORD); - mask = ((1 << (end_bit - bit)) - 1) << bit; + mask = ((1U << (end_bit - bit)) - 1) << bit; *wordp |= mask; wordp++; bits_set = end_bit - bit; @@ -888,7 +888,7 @@ xfs_buf_item_log_segment( */ end_bit = bits_to_set - bits_set; if (end_bit) { - mask = (1 << end_bit) - 1; + mask = (1U << end_bit) - 1; *wordp |= mask; } } @@ -1095,7 +1095,8 @@ xfs_buf_iodone_callback_error( bp->b_last_error != bp->b_error) { bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); bp->b_last_error = bp->b_error; - if (cfg->retry_timeout && !bp->b_first_retry_time) + if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && + !bp->b_first_retry_time) bp->b_first_retry_time = jiffies; xfs_buf_ioerror(bp, 0); @@ -1111,7 +1112,7 @@ xfs_buf_iodone_callback_error( if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && ++bp->b_retries > cfg->max_retries) goto permanent_error; - if (cfg->retry_timeout && + if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) goto permanent_error; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index c263e07..162dc18 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -384,7 +384,7 @@ restart: * If this is a metadata allocation, try to reuse the busy * extent instead of trimming the allocation. */ - if (!args->userdata && + if (!xfs_alloc_is_userdata(args->datatype) && !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) { if (!xfs_extent_busy_update_extent(args->mp, args->pag, busyp, fbno, flen, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e612a02..c68517b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -269,6 +269,8 @@ xfs_file_dio_aio_read( return -EINVAL; } + file_accessed(iocb->ki_filp); + /* * Locking is a bit tricky here. 
If we take an exclusive lock for direct * IO, we effectively serialise all new concurrent read IO to this file @@ -323,7 +325,6 @@ xfs_file_dio_aio_read( } xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); - file_accessed(iocb->ki_filp); return ret; } @@ -332,10 +333,7 @@ xfs_file_dax_read( struct kiocb *iocb, struct iov_iter *to) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - struct xfs_inode *ip = XFS_I(inode); - struct iov_iter data = *to; + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); size_t count = iov_iter_count(to); ssize_t ret = 0; @@ -345,11 +343,7 @@ xfs_file_dax_read( return 0; /* skip atime */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0); - if (ret > 0) { - iocb->ki_pos += ret; - iov_iter_advance(to, ret); - } + ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops); xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); file_accessed(iocb->ki_filp); @@ -711,70 +705,32 @@ xfs_file_dax_write( struct kiocb *iocb, struct iov_iter *from) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - ssize_t ret = 0; - int unaligned_io = 0; - int iolock; - struct iov_iter data; + int iolock = XFS_IOLOCK_EXCL; + ssize_t ret, error = 0; + size_t count; + loff_t pos; - /* "unaligned" here means not aligned to a filesystem block */ - if ((iocb->ki_pos & mp->m_blockmask) || - ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) { - unaligned_io = 1; - iolock = XFS_IOLOCK_EXCL; - } else if (mapping->nrpages) { - iolock = XFS_IOLOCK_EXCL; - } else { - iolock = XFS_IOLOCK_SHARED; - } xfs_rw_ilock(ip, iolock); - ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; - /* - * Yes, even DAX files can have page cache attached to them: A zeroed - * page is inserted into the pagecache when we have to serve a write - * fault on a hole. It should never be dirtied and can simply be - * dropped from the pagecache once we get real data for the page. - * - * XXX: This is racy against mmap, and there's nothing we can do about - * it. dax_do_io() should really do this invalidation internally as - * it will know if we've allocated over a holei for this specific IO and - * if so it needs to update the mapping tree and invalidate existing - * PTEs over the newly allocated range. Remove this invalidation when - * dax_do_io() is fixed up. - */ - if (mapping->nrpages) { - loff_t end = iocb->ki_pos + iov_iter_count(from) - 1; + pos = iocb->ki_pos; + count = iov_iter_count(from); - ret = invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, - end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - } + trace_xfs_file_dax_write(ip, count, pos); - if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) { - xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); - iolock = XFS_IOLOCK_SHARED; + ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops); + if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { + i_size_write(inode, iocb->ki_pos); + error = xfs_setfilesize(ip, pos, ret); } - trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos); - - data = *from; - ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, - xfs_end_io_direct_write, 0); - if (ret > 0) { - iocb->ki_pos += ret; - iov_iter_advance(from, ret); - } out: xfs_rw_iunlock(ip, iolock); - return ret; + return error ? 
error : ret; } STATIC ssize_t @@ -1513,7 +1469,7 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); + ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops); } else { ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); @@ -1547,7 +1503,7 @@ xfs_filemap_fault( * changes to xfs_get_blocks_direct() to map unwritten extent * ioend for conversion on read-only mappings. */ - ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault); + ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops); } else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 4a33a33..043ca380 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -30,6 +30,7 @@ #include "xfs_mru_cache.h" #include "xfs_filestream.h" #include "xfs_trace.h" +#include "xfs_ag_resv.h" struct xfs_fstrm_item { struct xfs_mru_cache_elem mru; @@ -198,7 +199,8 @@ xfs_filestream_pick_ag( } longest = xfs_alloc_longest_free_extent(mp, pag, - xfs_alloc_min_freelist(mp, pag)); + xfs_alloc_min_freelist(mp, pag), + xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (((minlen && longest >= minlen) || (!minlen && pag->pagf_freeblks >= minfree)) && (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || @@ -369,7 +371,8 @@ xfs_filestream_new_ag( struct xfs_mount *mp = ip->i_mount; xfs_extlen_t minlen = ap->length; xfs_agnumber_t startag = 0; - int flags, err = 0; + int flags = 0; + int err = 0; struct xfs_mru_cache_elem *mru; *agp = NULLAGNUMBER; @@ -385,8 +388,10 @@ xfs_filestream_new_ag( startag = (item->ag + 1) % mp->m_sb.sb_agcount; } - flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | - (ap->dfops->dop_low ? XFS_PICK_LOWSPACE : 0); + if (xfs_alloc_is_userdata(ap->datatype)) + flags |= XFS_PICK_USERDATA; + if (ap->dfops->dop_low) + flags |= XFS_PICK_LOWSPACE; err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 0b7f986..94ac06f 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -553,7 +553,7 @@ xfs_growfs_data_private( error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno, be32_to_cpu(agf->agf_length) - new), - new, &oinfo); + new, &oinfo, XFS_AG_RESV_NONE); if (error) goto error0; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index fb39a66..65b2e3f 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1414,6 +1414,16 @@ xfs_inode_set_eofblocks_tag( struct xfs_perag *pag; int tagged; + /* + * Don't bother locking the AG and looking up in the radix trees + * if we already know that we have the tag set. 
+ */ + if (ip->i_flags & XFS_IEOFBLOCKS) + return; + spin_lock(&ip->i_flags_lock); + ip->i_flags |= XFS_IEOFBLOCKS; + spin_unlock(&ip->i_flags_lock); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); trace_xfs_inode_set_eofblocks_tag(ip); @@ -1449,6 +1459,10 @@ xfs_inode_clear_eofblocks_tag( struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; + spin_lock(&ip->i_flags_lock); + ip->i_flags &= ~XFS_IEOFBLOCKS; + spin_unlock(&ip->i_flags_lock); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); spin_lock(&pag->pag_ici_lock); trace_xfs_inode_clear_eofblocks_tag(ip); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e1a411e..8f30d25 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -216,6 +216,7 @@ xfs_get_initial_prid(struct xfs_inode *dp) #define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) #define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */ +#define XFS_IEOFBLOCKS (1 << 10)/* has the preallocblocks tag set */ /* * Per-lifetime flags need to be reset when re-using a reclaimable inode during diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 2af0dda..c08253e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2016 Christoph Hellwig. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -42,17 +43,40 @@ #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ << mp->m_writeio_log) -#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP -STATIC int -xfs_iomap_eof_align_last_fsb( - xfs_mount_t *mp, - xfs_inode_t *ip, - xfs_extlen_t extsize, - xfs_fileoff_t *last_fsb) +void +xfs_bmbt_to_iomap( + struct xfs_inode *ip, + struct iomap *iomap, + struct xfs_bmbt_irec *imap) +{ + struct xfs_mount *mp = ip->i_mount; + + if (imap->br_startblock == HOLESTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_HOLE; + } else if (imap->br_startblock == DELAYSTARTBLOCK) { + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_DELALLOC; + } else { + iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock); + if (imap->br_state == XFS_EXT_UNWRITTEN) + iomap->type = IOMAP_UNWRITTEN; + else + iomap->type = IOMAP_MAPPED; + } + iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); + iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); + iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); +} + +static xfs_extlen_t +xfs_eof_alignment( + struct xfs_inode *ip, + xfs_extlen_t extsize) { - xfs_extlen_t align = 0; - int eof, error; + struct xfs_mount *mp = ip->i_mount; + xfs_extlen_t align = 0; if (!XFS_IS_REALTIME_INODE(ip)) { /* @@ -83,8 +107,21 @@ xfs_iomap_eof_align_last_fsb( align = extsize; } + return align; +} + +STATIC int +xfs_iomap_eof_align_last_fsb( + struct xfs_inode *ip, + xfs_extlen_t extsize, + xfs_fileoff_t *last_fsb) +{ + xfs_extlen_t align = xfs_eof_alignment(ip, extsize); + if (align) { xfs_fileoff_t new_last_fsb = roundup_64(*last_fsb, align); + int eof, error; + error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); if (error) return error; @@ -154,7 +191,7 @@ xfs_iomap_write_direct( */ ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags & XFS_IFEXTENTS); - error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); + error = xfs_iomap_eof_align_last_fsb(ip, extsz, &last_fsb); if (error) goto out_unlock; } else { @@ -274,130 +311,6 @@ out_trans_cancel: goto out_unlock; } -/* - * If the caller is doing a 
write at the end of the file, then extend the - * allocation out to the file system's write iosize. We clean up any extra - * space left over when the file is closed in xfs_inactive(). - * - * If we find we already have delalloc preallocation beyond EOF, don't do more - * preallocation as it it not needed. - */ -STATIC int -xfs_iomap_eof_want_preallocate( - xfs_mount_t *mp, - xfs_inode_t *ip, - xfs_off_t offset, - size_t count, - xfs_bmbt_irec_t *imap, - int nimaps, - int *prealloc) -{ - xfs_fileoff_t start_fsb; - xfs_filblks_t count_fsb; - int n, error, imaps; - int found_delalloc = 0; - - *prealloc = 0; - if (offset + count <= XFS_ISIZE(ip)) - return 0; - - /* - * If the file is smaller than the minimum prealloc and we are using - * dynamic preallocation, don't do any preallocation at all as it is - * likely this is the only write to the file that is going to be done. - */ - if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) && - XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks)) - return 0; - - /* - * If there are any real blocks past eof, then don't - * do any speculative allocation. - */ - start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1))); - count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - while (count_fsb > 0) { - imaps = nimaps; - error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps, - 0); - if (error) - return error; - for (n = 0; n < imaps; n++) { - if ((imap[n].br_startblock != HOLESTARTBLOCK) && - (imap[n].br_startblock != DELAYSTARTBLOCK)) - return 0; - start_fsb += imap[n].br_blockcount; - count_fsb -= imap[n].br_blockcount; - - if (imap[n].br_startblock == DELAYSTARTBLOCK) - found_delalloc = 1; - } - } - if (!found_delalloc) - *prealloc = 1; - return 0; -} - -/* - * Determine the initial size of the preallocation. We are beyond the current - * EOF here, but we need to take into account whether this is a sparse write or - * an extending write when determining the preallocation size. Hence we need to - * look up the extent that ends at the current write offset and use the result - * to determine the preallocation size. - * - * If the extent is a hole, then preallocation is essentially disabled. - * Otherwise we take the size of the preceeding data extent as the basis for the - * preallocation size. If the size of the extent is greater than half the - * maximum extent length, then use the current offset as the basis. This ensures - * that for large files the preallocation size always extends to MAXEXTLEN - * rather than falling short due to things like stripe unit/width alignment of - * real extents. - */ -STATIC xfs_fsblock_t -xfs_iomap_eof_prealloc_initial_size( - struct xfs_mount *mp, - struct xfs_inode *ip, - xfs_off_t offset, - xfs_bmbt_irec_t *imap, - int nimaps) -{ - xfs_fileoff_t start_fsb; - int imaps = 1; - int error; - - ASSERT(nimaps >= imaps); - - /* if we are using a specific prealloc size, return now */ - if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) - return 0; - - /* If the file is small, then use the minimum prealloc */ - if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign)) - return 0; - - /* - * As we write multiple pages, the offset will always align to the - * start of a page and hence point to a hole at EOF. i.e. if the size is - * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096) - * will return FSB 1. Hence if there are blocks in the file, we want to - * point to the block prior to the EOF block and not the hole that maps - * directly at @offset. 
- */ - start_fsb = XFS_B_TO_FSB(mp, offset); - if (start_fsb) - start_fsb--; - error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE); - if (error) - return 0; - - ASSERT(imaps == 1); - if (imap[0].br_startblock == HOLESTARTBLOCK) - return 0; - if (imap[0].br_blockcount <= (MAXEXTLEN >> 1)) - return imap[0].br_blockcount << 1; - return XFS_B_TO_FSB(mp, offset); -} - STATIC bool xfs_quota_need_throttle( struct xfs_inode *ip, @@ -459,27 +372,76 @@ xfs_quota_calc_throttle( } /* + * If we are doing a write at the end of the file and there are no allocations + * past this one, then extend the allocation out to the file system's write + * iosize. + * * If we don't have a user specified preallocation size, dynamically increase - * the preallocation size as the size of the file grows. Cap the maximum size + * the preallocation size as the size of the file grows. Cap the maximum size * at a single extent or less if the filesystem is near full. The closer the * filesystem is to full, the smaller the maximum prealocation. + * + * As an exception we don't do any preallocation at all if the file is smaller + * than the minimum preallocation and we are using the default dynamic + * preallocation scheme, as it is likely this is the only write to the file that + * is going to be done. + * + * We clean up any extra space left over when the file is closed in + * xfs_inactive(). */ STATIC xfs_fsblock_t xfs_iomap_prealloc_size( - struct xfs_mount *mp, struct xfs_inode *ip, - xfs_off_t offset, - struct xfs_bmbt_irec *imap, - int nimaps) + loff_t offset, + loff_t count, + xfs_extnum_t idx, + struct xfs_bmbt_irec *prev) { - xfs_fsblock_t alloc_blocks = 0; + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); int shift = 0; int64_t freesp; xfs_fsblock_t qblocks; int qshift = 0; + xfs_fsblock_t alloc_blocks = 0; + + if (offset + count <= XFS_ISIZE(ip)) + return 0; - alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset, - imap, nimaps); + if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) && + (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))) + return 0; + + /* + * If an explicit allocsize is set, the file is small, or we + * are writing behind a hole, then use the minimum prealloc: + */ + if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) || + XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || + idx == 0 || + prev->br_startoff + prev->br_blockcount < offset_fsb) + return mp->m_writeio_blocks; + + /* + * Determine the initial size of the preallocation. We are beyond the + * current EOF here, but we need to take into account whether this is + * a sparse write or an extending write when determining the + * preallocation size. Hence we need to look up the extent that ends + * at the current write offset and use the result to determine the + * preallocation size. + * + * If the extent is a hole, then preallocation is essentially disabled. + * Otherwise we take the size of the preceding data extent as the basis + * for the preallocation size. If the size of the extent is greater than + * half the maximum extent length, then use the current offset as the + * basis. This ensures that for large files the preallocation size + * always extends to MAXEXTLEN rather than falling short due to things + * like stripe unit/width alignment of real extents. 
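Before the implementation below, a stand-alone sketch of just that sizing decision (illustrative only; the real function also applies quota and free-space throttling and an m_writeio_blocks floor): double the preceding data extent unless it is already more than half of MAXEXTLEN, in which case scale the preallocation with the current offset instead.

/* Stand-alone sketch of the initial prealloc sizing heuristic (no throttling). */
#include <stdio.h>

#define MAXEXTLEN	((1 << 21) - 1)		/* 2097151 blocks, as in XFS */

/* prev_len: length of the data extent ending at the write offset, in blocks */
static long long initial_prealloc(long long prev_len, long long offset_fsb)
{
	if (prev_len <= (MAXEXTLEN >> 1))
		return prev_len << 1;		/* double the preceding extent */
	return offset_fsb;			/* large file: scale with offset */
}

int main(void)
{
	/* A 128-block extent before EOF: preallocation doubles it to 256. */
	printf("prev=128     -> prealloc=%lld blocks\n",
	       initial_prealloc(128, 262144));

	/* Preceding extent already over MAXEXTLEN/2: base prealloc on the offset. */
	printf("prev=1500000 -> prealloc=%lld blocks\n",
	       initial_prealloc(1500000, 262144));
	return 0;
}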
+ */ + if (prev->br_blockcount <= (MAXEXTLEN >> 1)) + alloc_blocks = prev->br_blockcount << 1; + else + alloc_blocks = XFS_B_TO_FSB(mp, offset); if (!alloc_blocks) goto check_writeio; qblocks = alloc_blocks; @@ -550,120 +512,145 @@ xfs_iomap_prealloc_size( */ while (alloc_blocks && alloc_blocks >= freesp) alloc_blocks >>= 4; - check_writeio: if (alloc_blocks < mp->m_writeio_blocks) alloc_blocks = mp->m_writeio_blocks; - trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift, mp->m_writeio_blocks); - return alloc_blocks; } -int -xfs_iomap_write_delay( - xfs_inode_t *ip, - xfs_off_t offset, - size_t count, - xfs_bmbt_irec_t *ret_imap) +static int +xfs_file_iomap_begin_delay( + struct inode *inode, + loff_t offset, + loff_t count, + unsigned flags, + struct iomap *iomap) { - xfs_mount_t *mp = ip->i_mount; - xfs_fileoff_t offset_fsb; - xfs_fileoff_t last_fsb; - xfs_off_t aligned_offset; - xfs_fileoff_t ioalign; - xfs_extlen_t extsz; - int nimaps; - xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; - int prealloc; - int error; - - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - - /* - * Make sure that the dquots are there. This doesn't hold - * the ilock across a disk read. - */ - error = xfs_qm_dqattach_locked(ip, 0); - if (error) - return error; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t maxbytes_fsb = + XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + xfs_fileoff_t end_fsb, orig_end_fsb; + int error = 0, eof = 0; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec prev; + xfs_extnum_t idx; - extsz = xfs_get_extsz_hint(ip); - offset_fsb = XFS_B_TO_FSBT(mp, offset); + ASSERT(!XFS_IS_REALTIME_INODE(ip)); + ASSERT(!xfs_get_extsz_hint(ip)); - error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, - imap, XFS_WRITE_IMAPS, &prealloc); - if (error) - return error; + xfs_ilock(ip, XFS_ILOCK_EXCL); -retry: - if (prealloc) { - xfs_fsblock_t alloc_blocks; + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + error = -EFSCORRUPTED; + goto out_unlock; + } - alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap, - XFS_WRITE_IMAPS); + XFS_STATS_INC(mp, xs_blk_mapw); - aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); - ioalign = XFS_B_TO_FSBT(mp, aligned_offset); - last_fsb = ioalign + alloc_blocks; - } else { - last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; } - if (prealloc || extsz) { - error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); - if (error) - return error; + xfs_bmap_search_extents(ip, offset_fsb, XFS_DATA_FORK, &eof, &idx, + &got, &prev); + if (!eof && got.br_startoff <= offset_fsb) { + trace_xfs_iomap_found(ip, offset, count, 0, &got); + goto done; } + error = xfs_qm_dqattach_locked(ip, 0); + if (error) + goto out_unlock; + /* - * Make sure preallocation does not create extents beyond the range we - * actually support in this filesystem. + * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages + * to keep the chunks of work done where somewhat symmetric with the + * work writeback does. 
This is a completely arbitrary number pulled + * out of thin air as a best guess for initial testing. + * + * Note that the values needs to be less than 32-bits wide until + * the lower level functions are updated. */ - if (last_fsb > XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)) - last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + count = min_t(loff_t, count, 1024 * PAGE_SIZE); + end_fsb = orig_end_fsb = + min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); + + if (eof) { + xfs_fsblock_t prealloc_blocks; - ASSERT(last_fsb > offset_fsb); + prealloc_blocks = + xfs_iomap_prealloc_size(ip, offset, count, idx, &prev); + if (prealloc_blocks) { + xfs_extlen_t align; + xfs_off_t end_offset; - nimaps = XFS_WRITE_IMAPS; - error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb, - imap, &nimaps, XFS_BMAPI_ENTIRE); + end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1); + end_fsb = XFS_B_TO_FSBT(mp, end_offset) + + prealloc_blocks; + + align = xfs_eof_alignment(ip, 0); + if (align) + end_fsb = roundup_64(end_fsb, align); + + end_fsb = min(end_fsb, maxbytes_fsb); + ASSERT(end_fsb > offset_fsb); + } + } + +retry: + error = xfs_bmapi_reserve_delalloc(ip, offset_fsb, + end_fsb - offset_fsb, &got, + &prev, &idx, eof); switch (error) { case 0: + break; case -ENOSPC: case -EDQUOT: - break; - default: - return error; - } - - /* - * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry - * without EOF preallocation. - */ - if (nimaps == 0) { + /* retry without any preallocation */ trace_xfs_delalloc_enospc(ip, offset, count); - if (prealloc) { - prealloc = 0; - error = 0; + if (end_fsb != orig_end_fsb) { + end_fsb = orig_end_fsb; goto retry; } - return error ? error : -ENOSPC; + /*FALLTHRU*/ + default: + goto out_unlock; } - if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) - return xfs_alert_fsblock_zero(ip, &imap[0]); - /* * Tag the inode as speculatively preallocated so we can reclaim this * space on demand, if necessary. 
*/ - if (prealloc) + if (end_fsb != orig_end_fsb) xfs_inode_set_eofblocks_tag(ip); - *ret_imap = imap[0]; - return 0; + trace_xfs_iomap_alloc(ip, offset, count, 0, &got); +done: + if (isnullstartblock(got.br_startblock)) + got.br_startblock = DELAYSTARTBLOCK; + + if (!got.br_startblock) { + error = xfs_alert_fsblock_zero(ip, &got); + if (error) + goto out_unlock; + } + + xfs_bmbt_to_iomap(ip, iomap, &got); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; } /* @@ -947,37 +934,13 @@ error_on_bmapi_transaction: return error; } -void -xfs_bmbt_to_iomap( - struct xfs_inode *ip, - struct iomap *iomap, - struct xfs_bmbt_irec *imap) -{ - struct xfs_mount *mp = ip->i_mount; - - if (imap->br_startblock == HOLESTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->type = IOMAP_HOLE; - } else if (imap->br_startblock == DELAYSTARTBLOCK) { - iomap->blkno = IOMAP_NULL_BLOCK; - iomap->type = IOMAP_DELALLOC; - } else { - iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock); - if (imap->br_state == XFS_EXT_UNWRITTEN) - iomap->type = IOMAP_UNWRITTEN; - else - iomap->type = IOMAP_MAPPED; - } - iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); - iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); - iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); -} - -static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps) +static inline bool imap_needs_alloc(struct inode *inode, + struct xfs_bmbt_irec *imap, int nimaps) { return !nimaps || imap->br_startblock == HOLESTARTBLOCK || - imap->br_startblock == DELAYSTARTBLOCK; + imap->br_startblock == DELAYSTARTBLOCK || + (IS_DAX(inode) && ISUNWRITTEN(imap)); } static int @@ -993,11 +956,18 @@ xfs_file_iomap_begin( struct xfs_bmbt_irec imap; xfs_fileoff_t offset_fsb, end_fsb; int nimaps = 1, error = 0; + unsigned lockmode; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - xfs_ilock(ip, XFS_ILOCK_EXCL); + if ((flags & IOMAP_WRITE) && + !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { + return xfs_file_iomap_begin_delay(inode, offset, length, flags, + iomap); + } + + lockmode = xfs_ilock_data_map_shared(ip); ASSERT(offset <= mp->m_super->s_maxbytes); if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) @@ -1008,11 +978,11 @@ xfs_file_iomap_begin( error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, XFS_BMAPI_ENTIRE); if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return error; } - if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) { + if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) { /* * We cap the maximum length we map here to MAX_WRITEBACK_PAGES * pages to keep the chunks of work done where somewhat symmetric @@ -1024,27 +994,23 @@ xfs_file_iomap_begin( * the lower level functions are updated. */ length = min_t(loff_t, length, 1024 * PAGE_SIZE); - if (xfs_get_extsz_hint(ip)) { - /* - * xfs_iomap_write_direct() expects the shared lock. It - * is unlocked on return. - */ - xfs_ilock_demote(ip, XFS_ILOCK_EXCL); - error = xfs_iomap_write_direct(ip, offset, length, &imap, - nimaps); - } else { - error = xfs_iomap_write_delay(ip, offset, length, &imap); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - + /* + * xfs_iomap_write_direct() expects the shared lock. It + * is unlocked on return. 
+ */ + if (lockmode == XFS_ILOCK_EXCL) + xfs_ilock_demote(ip, lockmode); + error = xfs_iomap_write_direct(ip, offset, length, &imap, + nimaps); if (error) return error; + iomap->flags = IOMAP_F_NEW; trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); } else { ASSERT(nimaps); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); trace_xfs_iomap_found(ip, offset, length, 0, &imap); } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index fb8aca3..6498be4 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -25,8 +25,6 @@ struct xfs_bmbt_irec; int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, struct xfs_bmbt_irec *, int); -int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, - struct xfs_bmbt_irec *); int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, struct xfs_bmbt_irec *); int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 765f084..2b6eec5 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -413,7 +413,8 @@ struct xlog { /* log record crc error injection factor */ uint32_t l_badcrc_factor; #endif - + /* log recovery lsn tracking (for buffer submission */ + xfs_lsn_t l_recovery_lsn; }; #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e8638fd..846483d 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -44,6 +44,7 @@ #include "xfs_error.h" #include "xfs_dir2.h" #include "xfs_rmap_item.h" +#include "xfs_buf_item.h" #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) @@ -381,6 +382,15 @@ xlog_recover_iodone( SHUTDOWN_META_IO_ERROR); } } + + /* + * On v5 supers, a bli could be attached to update the metadata LSN. + * Clean it up. 
+ */ + if (bp->b_fspriv) + xfs_buf_item_relse(bp); + ASSERT(bp->b_fspriv == NULL); + bp->b_iodone = NULL; xfs_buf_ioend(bp); } @@ -2360,12 +2370,14 @@ static void xlog_recover_validate_buf_type( struct xfs_mount *mp, struct xfs_buf *bp, - xfs_buf_log_format_t *buf_f) + xfs_buf_log_format_t *buf_f, + xfs_lsn_t current_lsn) { struct xfs_da_blkinfo *info = bp->b_addr; __uint32_t magic32; __uint16_t magic16; __uint16_t magicda; + char *warnmsg = NULL; /* * We can only do post recovery validation on items on CRC enabled @@ -2404,31 +2416,27 @@ xlog_recover_validate_buf_type( bp->b_ops = &xfs_rmapbt_buf_ops; break; default: - xfs_warn(mp, "Bad btree block magic!"); - ASSERT(0); + warnmsg = "Bad btree block magic!"; break; } break; case XFS_BLFT_AGF_BUF: if (magic32 != XFS_AGF_MAGIC) { - xfs_warn(mp, "Bad AGF block magic!"); - ASSERT(0); + warnmsg = "Bad AGF block magic!"; break; } bp->b_ops = &xfs_agf_buf_ops; break; case XFS_BLFT_AGFL_BUF: if (magic32 != XFS_AGFL_MAGIC) { - xfs_warn(mp, "Bad AGFL block magic!"); - ASSERT(0); + warnmsg = "Bad AGFL block magic!"; break; } bp->b_ops = &xfs_agfl_buf_ops; break; case XFS_BLFT_AGI_BUF: if (magic32 != XFS_AGI_MAGIC) { - xfs_warn(mp, "Bad AGI block magic!"); - ASSERT(0); + warnmsg = "Bad AGI block magic!"; break; } bp->b_ops = &xfs_agi_buf_ops; @@ -2438,8 +2446,7 @@ xlog_recover_validate_buf_type( case XFS_BLFT_GDQUOT_BUF: #ifdef CONFIG_XFS_QUOTA if (magic16 != XFS_DQUOT_MAGIC) { - xfs_warn(mp, "Bad DQUOT block magic!"); - ASSERT(0); + warnmsg = "Bad DQUOT block magic!"; break; } bp->b_ops = &xfs_dquot_buf_ops; @@ -2451,16 +2458,14 @@ xlog_recover_validate_buf_type( break; case XFS_BLFT_DINO_BUF: if (magic16 != XFS_DINODE_MAGIC) { - xfs_warn(mp, "Bad INODE block magic!"); - ASSERT(0); + warnmsg = "Bad INODE block magic!"; break; } bp->b_ops = &xfs_inode_buf_ops; break; case XFS_BLFT_SYMLINK_BUF: if (magic32 != XFS_SYMLINK_MAGIC) { - xfs_warn(mp, "Bad symlink block magic!"); - ASSERT(0); + warnmsg = "Bad symlink block magic!"; break; } bp->b_ops = &xfs_symlink_buf_ops; @@ -2468,8 +2473,7 @@ xlog_recover_validate_buf_type( case XFS_BLFT_DIR_BLOCK_BUF: if (magic32 != XFS_DIR2_BLOCK_MAGIC && magic32 != XFS_DIR3_BLOCK_MAGIC) { - xfs_warn(mp, "Bad dir block magic!"); - ASSERT(0); + warnmsg = "Bad dir block magic!"; break; } bp->b_ops = &xfs_dir3_block_buf_ops; @@ -2477,8 +2481,7 @@ xlog_recover_validate_buf_type( case XFS_BLFT_DIR_DATA_BUF: if (magic32 != XFS_DIR2_DATA_MAGIC && magic32 != XFS_DIR3_DATA_MAGIC) { - xfs_warn(mp, "Bad dir data magic!"); - ASSERT(0); + warnmsg = "Bad dir data magic!"; break; } bp->b_ops = &xfs_dir3_data_buf_ops; @@ -2486,8 +2489,7 @@ xlog_recover_validate_buf_type( case XFS_BLFT_DIR_FREE_BUF: if (magic32 != XFS_DIR2_FREE_MAGIC && magic32 != XFS_DIR3_FREE_MAGIC) { - xfs_warn(mp, "Bad dir3 free magic!"); - ASSERT(0); + warnmsg = "Bad dir3 free magic!"; break; } bp->b_ops = &xfs_dir3_free_buf_ops; @@ -2495,8 +2497,7 @@ xlog_recover_validate_buf_type( case XFS_BLFT_DIR_LEAF1_BUF: if (magicda != XFS_DIR2_LEAF1_MAGIC && magicda != XFS_DIR3_LEAF1_MAGIC) { - xfs_warn(mp, "Bad dir leaf1 magic!"); - ASSERT(0); + warnmsg = "Bad dir leaf1 magic!"; break; } bp->b_ops = &xfs_dir3_leaf1_buf_ops; @@ -2504,8 +2505,7 @@ xlog_recover_validate_buf_type( case XFS_BLFT_DIR_LEAFN_BUF: if (magicda != XFS_DIR2_LEAFN_MAGIC && magicda != XFS_DIR3_LEAFN_MAGIC) { - xfs_warn(mp, "Bad dir leafn magic!"); - ASSERT(0); + warnmsg = "Bad dir leafn magic!"; break; } bp->b_ops = &xfs_dir3_leafn_buf_ops; @@ -2513,8 +2513,7 @@ xlog_recover_validate_buf_type( case 
XFS_BLFT_DA_NODE_BUF: if (magicda != XFS_DA_NODE_MAGIC && magicda != XFS_DA3_NODE_MAGIC) { - xfs_warn(mp, "Bad da node magic!"); - ASSERT(0); + warnmsg = "Bad da node magic!"; break; } bp->b_ops = &xfs_da3_node_buf_ops; @@ -2522,24 +2521,21 @@ xlog_recover_validate_buf_type( case XFS_BLFT_ATTR_LEAF_BUF: if (magicda != XFS_ATTR_LEAF_MAGIC && magicda != XFS_ATTR3_LEAF_MAGIC) { - xfs_warn(mp, "Bad attr leaf magic!"); - ASSERT(0); + warnmsg = "Bad attr leaf magic!"; break; } bp->b_ops = &xfs_attr3_leaf_buf_ops; break; case XFS_BLFT_ATTR_RMT_BUF: if (magic32 != XFS_ATTR3_RMT_MAGIC) { - xfs_warn(mp, "Bad attr remote magic!"); - ASSERT(0); + warnmsg = "Bad attr remote magic!"; break; } bp->b_ops = &xfs_attr3_rmt_buf_ops; break; case XFS_BLFT_SB_BUF: if (magic32 != XFS_SB_MAGIC) { - xfs_warn(mp, "Bad SB block magic!"); - ASSERT(0); + warnmsg = "Bad SB block magic!"; break; } bp->b_ops = &xfs_sb_buf_ops; @@ -2556,6 +2552,40 @@ xlog_recover_validate_buf_type( xfs_blft_from_flags(buf_f)); break; } + + /* + * Nothing else to do in the case of a NULL current LSN as this means + * the buffer is more recent than the change in the log and will be + * skipped. + */ + if (current_lsn == NULLCOMMITLSN) + return; + + if (warnmsg) { + xfs_warn(mp, warnmsg); + ASSERT(0); + } + + /* + * We must update the metadata LSN of the buffer as it is written out to + * ensure that older transactions never replay over this one and corrupt + * the buffer. This can occur if log recovery is interrupted at some + * point after the current transaction completes, at which point a + * subsequent mount starts recovery from the beginning. + * + * Write verifiers update the metadata LSN from log items attached to + * the buffer. Therefore, initialize a bli purely to carry the LSN to + * the verifier. We'll clean it up in our ->iodone() callback. 
+ */ + if (bp->b_ops) { + struct xfs_buf_log_item *bip; + + ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_item_init(bp, mp); + bip = bp->b_fspriv; + bip->bli_item.li_lsn = current_lsn; + } } /* @@ -2569,7 +2599,8 @@ xlog_recover_do_reg_buffer( struct xfs_mount *mp, xlog_recover_item_t *item, struct xfs_buf *bp, - xfs_buf_log_format_t *buf_f) + xfs_buf_log_format_t *buf_f, + xfs_lsn_t current_lsn) { int i; int bit; @@ -2642,7 +2673,7 @@ xlog_recover_do_reg_buffer( /* Shouldn't be any more regions */ ASSERT(i == item->ri_total); - xlog_recover_validate_buf_type(mp, bp, buf_f); + xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn); } /* @@ -2685,7 +2716,7 @@ xlog_recover_do_dquot_buffer( if (log->l_quotaoffs_flag & type) return false; - xlog_recover_do_reg_buffer(mp, item, bp, buf_f); + xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN); return true; } @@ -2773,7 +2804,8 @@ xlog_recover_buffer_pass2( */ lsn = xlog_recover_get_buf_lsn(mp, bp); if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { - xlog_recover_validate_buf_type(mp, bp, buf_f); + trace_xfs_log_recover_buf_skip(log, buf_f); + xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN); goto out_release; } @@ -2789,7 +2821,7 @@ xlog_recover_buffer_pass2( if (!dirty) goto out_release; } else { - xlog_recover_do_reg_buffer(mp, item, bp, buf_f); + xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); } /* @@ -3846,14 +3878,13 @@ STATIC int xlog_recover_commit_trans( struct xlog *log, struct xlog_recover *trans, - int pass) + int pass, + struct list_head *buffer_list) { int error = 0; - int error2; int items_queued = 0; struct xlog_recover_item *item; struct xlog_recover_item *next; - LIST_HEAD (buffer_list); LIST_HEAD (ra_list); LIST_HEAD (done_list); @@ -3876,7 +3907,7 @@ xlog_recover_commit_trans( items_queued++; if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) { error = xlog_recover_items_pass2(log, trans, - &buffer_list, &ra_list); + buffer_list, &ra_list); list_splice_tail_init(&ra_list, &done_list); items_queued = 0; } @@ -3894,15 +3925,14 @@ out: if (!list_empty(&ra_list)) { if (!error) error = xlog_recover_items_pass2(log, trans, - &buffer_list, &ra_list); + buffer_list, &ra_list); list_splice_tail_init(&ra_list, &done_list); } if (!list_empty(&done_list)) list_splice_init(&done_list, &trans->r_itemq); - error2 = xfs_buf_delwri_submit(&buffer_list); - return error ? error : error2; + return error; } STATIC void @@ -4085,7 +4115,8 @@ xlog_recovery_process_trans( char *dp, unsigned int len, unsigned int flags, - int pass) + int pass, + struct list_head *buffer_list) { int error = 0; bool freeit = false; @@ -4109,7 +4140,8 @@ xlog_recovery_process_trans( error = xlog_recover_add_to_cont_trans(log, trans, dp, len); break; case XLOG_COMMIT_TRANS: - error = xlog_recover_commit_trans(log, trans, pass); + error = xlog_recover_commit_trans(log, trans, pass, + buffer_list); /* success or fail, we are now done with this transaction. */ freeit = true; break; @@ -4191,10 +4223,12 @@ xlog_recover_process_ophdr( struct xlog_op_header *ohead, char *dp, char *end, - int pass) + int pass, + struct list_head *buffer_list) { struct xlog_recover *trans; unsigned int len; + int error; /* Do we understand who wrote this op? 
*/ if (ohead->oh_clientid != XFS_TRANSACTION && @@ -4221,8 +4255,39 @@ xlog_recover_process_ophdr( return 0; } + /* + * The recovered buffer queue is drained only once we know that all + * recovery items for the current LSN have been processed. This is + * required because: + * + * - Buffer write submission updates the metadata LSN of the buffer. + * - Log recovery skips items with a metadata LSN >= the current LSN of + * the recovery item. + * - Separate recovery items against the same metadata buffer can share + * a current LSN. I.e., consider that the LSN of a recovery item is + * defined as the starting LSN of the first record in which its + * transaction appears, that a record can hold multiple transactions, + * and/or that a transaction can span multiple records. + * + * In other words, we are allowed to submit a buffer from log recovery + * once per current LSN. Otherwise, we may incorrectly skip recovery + * items and cause corruption. + * + * We don't know up front whether buffers are updated multiple times per + * LSN. Therefore, track the current LSN of each commit log record as it + * is processed and drain the queue when it changes. Use commit records + * because they are ordered correctly by the logging code. + */ + if (log->l_recovery_lsn != trans->r_lsn && + ohead->oh_flags & XLOG_COMMIT_TRANS) { + error = xfs_buf_delwri_submit(buffer_list); + if (error) + return error; + log->l_recovery_lsn = trans->r_lsn; + } + return xlog_recovery_process_trans(log, trans, dp, len, - ohead->oh_flags, pass); + ohead->oh_flags, pass, buffer_list); } /* @@ -4240,7 +4305,8 @@ xlog_recover_process_data( struct hlist_head rhash[], struct xlog_rec_header *rhead, char *dp, - int pass) + int pass, + struct list_head *buffer_list) { struct xlog_op_header *ohead; char *end; @@ -4254,6 +4320,7 @@ xlog_recover_process_data( if (xlog_header_check_recover(log->l_mp, rhead)) return -EIO; + trace_xfs_log_recover_record(log, rhead, pass); while ((dp < end) && num_logops) { ohead = (struct xlog_op_header *)dp; @@ -4262,7 +4329,7 @@ xlog_recover_process_data( /* errors will abort recovery */ error = xlog_recover_process_ophdr(log, rhash, rhead, ohead, - dp, end, pass); + dp, end, pass, buffer_list); if (error) return error; @@ -4685,7 +4752,8 @@ xlog_recover_process( struct hlist_head rhash[], struct xlog_rec_header *rhead, char *dp, - int pass) + int pass, + struct list_head *buffer_list) { int error; __le32 crc; @@ -4732,7 +4800,8 @@ xlog_recover_process( if (error) return error; - return xlog_recover_process_data(log, rhash, rhead, dp, pass); + return xlog_recover_process_data(log, rhash, rhead, dp, pass, + buffer_list); } STATIC int @@ -4793,9 +4862,11 @@ xlog_do_recovery_pass( char *offset; xfs_buf_t *hbp, *dbp; int error = 0, h_size, h_len; + int error2 = 0; int bblks, split_bblks; int hblks, split_hblks, wrapped_hblks; struct hlist_head rhash[XLOG_RHASH_SIZE]; + LIST_HEAD (buffer_list); ASSERT(head_blk != tail_blk); rhead_blk = 0; @@ -4981,7 +5052,7 @@ xlog_do_recovery_pass( } error = xlog_recover_process(log, rhash, rhead, offset, - pass); + pass, &buffer_list); if (error) goto bread_err2; @@ -5012,7 +5083,8 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - error = xlog_recover_process(log, rhash, rhead, offset, pass); + error = xlog_recover_process(log, rhash, rhead, offset, pass, + &buffer_list); if (error) goto bread_err2; @@ -5025,10 +5097,17 @@ xlog_do_recovery_pass( bread_err1: xlog_put_bp(hbp); + /* + * Submit buffers that have been added from the last record processed, + * regardless 
of error status. + */ + if (!list_empty(&buffer_list)) + error2 = xfs_buf_delwri_submit(&buffer_list); + if (error && first_bad) *first_bad = rhead_blk; - return error; + return error ? error : error2; } /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index faeead6..56e85a6 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -934,6 +934,20 @@ xfs_mountfs( } /* + * Now the log is fully replayed, we can transition to full read-only + * mode for read-only mounts. This will sync all the metadata and clean + * the log so that the recovery we just performed does not have to be + * replayed again on the next mount. + * + * We use the same quiesce mechanism as the rw->ro remount, as they are + * semantically identical operations. + */ + if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) == + XFS_MOUNT_RDONLY) { + xfs_quiesce_attr(mp); + } + + /* * Complete the quota initialisation, post-log-replay component. */ if (quotamount) { diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b36676c..041d949 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -57,10 +57,16 @@ enum { #define XFS_ERR_RETRY_FOREVER -1 +/* + * Although retry_timeout is in jiffies which is normally an unsigned long, + * we limit the retry timeout to 86400 seconds, or one day. So even a + * signed 32-bit long is sufficient for a HZ value up to 24855. Making it + * signed lets us store the special "-1" value, meaning retry forever. + */ struct xfs_error_cfg { struct xfs_kobj kobj; int max_retries; - unsigned long retry_timeout; /* in jiffies, 0 = no timeout */ + long retry_timeout; /* in jiffies, -1 = infinite */ }; typedef struct xfs_mount { @@ -325,6 +331,22 @@ xfs_mp_fail_writes(struct xfs_mount *mp) } #endif +/* per-AG block reservation data structures*/ +enum xfs_ag_resv_type { + XFS_AG_RESV_NONE = 0, + XFS_AG_RESV_METADATA, + XFS_AG_RESV_AGFL, +}; + +struct xfs_ag_resv { + /* number of blocks originally reserved here */ + xfs_extlen_t ar_orig_reserved; + /* number of blocks reserved here */ + xfs_extlen_t ar_reserved; + /* number of blocks originally asked for */ + xfs_extlen_t ar_asked; +}; + /* * Per-ag incore structure, copies of information in agf and agi, to improve the * performance of allocation group selection. @@ -372,8 +394,28 @@ typedef struct xfs_perag { /* for rcu-safe freeing */ struct rcu_head rcu_head; int pagb_count; /* pagb slots in use */ + + /* Blocks reserved for all kinds of metadata. */ + struct xfs_ag_resv pag_meta_resv; + /* Blocks reserved for just AGFL-based metadata. */ + struct xfs_ag_resv pag_agfl_resv; } xfs_perag_t; +static inline struct xfs_ag_resv * +xfs_perag_resv( + struct xfs_perag *pag, + enum xfs_ag_resv_type type) +{ + switch (type) { + case XFS_AG_RESV_METADATA: + return &pag->pag_meta_resv; + case XFS_AG_RESV_AGFL: + return &pag->pag_agfl_resv; + default: + return NULL; + } +} + extern void xfs_uuid_table_free(void); extern int xfs_log_sbcount(xfs_mount_t *); extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 2500f28..0432a45 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -51,28 +51,16 @@ xfs_rui_item_free( kmem_zone_free(xfs_rui_zone, ruip); } -/* - * This returns the number of iovecs needed to log the given rui item. - * We only need 1 iovec for an rui item. It just logs the rui_log_format - * structure. 
- */ -static inline int -xfs_rui_item_sizeof( - struct xfs_rui_log_item *ruip) -{ - return sizeof(struct xfs_rui_log_format) + - (ruip->rui_format.rui_nextents - 1) * - sizeof(struct xfs_map_extent); -} - STATIC void xfs_rui_item_size( struct xfs_log_item *lip, int *nvecs, int *nbytes) { + struct xfs_rui_log_item *ruip = RUI_ITEM(lip); + *nvecs += 1; - *nbytes += xfs_rui_item_sizeof(RUI_ITEM(lip)); + *nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents); } /* @@ -97,7 +85,7 @@ xfs_rui_item_format( ruip->rui_format.rui_size = 1; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format, - xfs_rui_item_sizeof(ruip)); + xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents)); } /* @@ -205,16 +193,12 @@ xfs_rui_init( { struct xfs_rui_log_item *ruip; - uint size; ASSERT(nextents > 0); - if (nextents > XFS_RUI_MAX_FAST_EXTENTS) { - size = (uint)(sizeof(struct xfs_rui_log_item) + - ((nextents - 1) * sizeof(struct xfs_map_extent))); - ruip = kmem_zalloc(size, KM_SLEEP); - } else { + if (nextents > XFS_RUI_MAX_FAST_EXTENTS) + ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP); + else ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP); - } xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); ruip->rui_format.rui_nextents = nextents; @@ -239,14 +223,12 @@ xfs_rui_copy_format( uint len; src_rui_fmt = buf->i_addr; - len = sizeof(struct xfs_rui_log_format) + - (src_rui_fmt->rui_nextents - 1) * - sizeof(struct xfs_map_extent); + len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents); if (buf->i_len != len) return -EFSCORRUPTED; - memcpy((char *)dst_rui_fmt, (char *)src_rui_fmt, len); + memcpy(dst_rui_fmt, src_rui_fmt, len); return 0; } diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h index aefcc3a..340c968 100644 --- a/fs/xfs/xfs_rmap_item.h +++ b/fs/xfs/xfs_rmap_item.h @@ -70,6 +70,14 @@ struct xfs_rui_log_item { struct xfs_rui_log_format rui_format; }; +static inline size_t +xfs_rui_log_item_sizeof( + unsigned int nr) +{ + return offsetof(struct xfs_rui_log_item, rui_format) + + xfs_rui_log_format_sizeof(nr); +} + /* * This is the "rmap update done" log item. It is used to log the fact that * some rmapbt updates mentioned in an earlier rui item have been performed. diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index fd6be45..2d092f9 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1137,7 +1137,7 @@ xfs_restore_resvblks(struct xfs_mount *mp) * Note: xfs_log_quiesce() stops background log work - the callers must ensure * it is started again when appropriate. 
*/ -static void +void xfs_quiesce_attr( struct xfs_mount *mp) { @@ -1782,9 +1782,8 @@ xfs_init_zones(void) if (!xfs_rud_zone) goto out_destroy_icreate_zone; - xfs_rui_zone = kmem_zone_init((sizeof(struct xfs_rui_log_item) + - ((XFS_RUI_MAX_FAST_EXTENTS - 1) * - sizeof(struct xfs_map_extent))), + xfs_rui_zone = kmem_zone_init( + xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS), "xfs_rui_item"); if (!xfs_rui_zone) goto out_destroy_rud_zone; diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 529bce9..b6418ab 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -61,6 +61,7 @@ struct xfs_mount; struct xfs_buftarg; struct block_device; +extern void xfs_quiesce_attr(struct xfs_mount *mp); extern void xfs_flush_inodes(struct xfs_mount *mp); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 79cfd3f..5f8d55d 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -393,9 +393,15 @@ max_retries_show( struct kobject *kobject, char *buf) { + int retries; struct xfs_error_cfg *cfg = to_error_cfg(kobject); - return snprintf(buf, PAGE_SIZE, "%d\n", cfg->max_retries); + if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER) + retries = -1; + else + retries = cfg->max_retries; + + return snprintf(buf, PAGE_SIZE, "%d\n", retries); } static ssize_t @@ -415,7 +421,10 @@ max_retries_store( if (val < -1) return -EINVAL; - cfg->max_retries = val; + if (val == -1) + cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; + else + cfg->max_retries = val; return count; } XFS_SYSFS_ATTR_RW(max_retries); @@ -425,10 +434,15 @@ retry_timeout_seconds_show( struct kobject *kobject, char *buf) { + int timeout; struct xfs_error_cfg *cfg = to_error_cfg(kobject); - return snprintf(buf, PAGE_SIZE, "%ld\n", - jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC); + if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER) + timeout = -1; + else + timeout = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC; + + return snprintf(buf, PAGE_SIZE, "%d\n", timeout); } static ssize_t @@ -445,11 +459,16 @@ retry_timeout_seconds_store( if (ret) return ret; - /* 1 day timeout maximum */ - if (val < 0 || val > 86400) + /* 1 day timeout maximum, -1 means infinite */ + if (val < -1 || val > 86400) return -EINVAL; - cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC); + if (val == -1) + cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; + else { + cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC); + ASSERT(msecs_to_jiffies(val * MSEC_PER_SEC) < LONG_MAX); + } return count; } XFS_SYSFS_ATTR_RW(retry_timeout_seconds); @@ -519,18 +538,19 @@ struct xfs_error_init { static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = { { .name = "default", .max_retries = XFS_ERR_RETRY_FOREVER, - .retry_timeout = 0, + .retry_timeout = XFS_ERR_RETRY_FOREVER, }, { .name = "EIO", .max_retries = XFS_ERR_RETRY_FOREVER, - .retry_timeout = 0, + .retry_timeout = XFS_ERR_RETRY_FOREVER, }, { .name = "ENOSPC", .max_retries = XFS_ERR_RETRY_FOREVER, - .retry_timeout = 0, + .retry_timeout = XFS_ERR_RETRY_FOREVER, }, { .name = "ENODEV", - .max_retries = 0, + .max_retries = 0, /* We can't recover from devices disappearing */ + .retry_timeout = 0, }, }; @@ -561,7 +581,10 @@ xfs_error_sysfs_init_class( goto out_error; cfg->max_retries = init[i].max_retries; - cfg->retry_timeout = msecs_to_jiffies( + if (init[i].retry_timeout == XFS_ERR_RETRY_FOREVER) + cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; + else + cfg->retry_timeout = 
msecs_to_jiffies( init[i].retry_timeout * MSEC_PER_SEC); } return 0; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index d303a66..c6b2b1d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1570,14 +1570,15 @@ TRACE_EVENT(xfs_agf, TRACE_EVENT(xfs_free_extent, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_extlen_t len, bool isfl, int haveleft, int haveright), - TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright), + xfs_extlen_t len, enum xfs_ag_resv_type resv, int haveleft, + int haveright), + TP_ARGS(mp, agno, agbno, len, resv, haveleft, haveright), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(int, isfl) + __field(int, resv) __field(int, haveleft) __field(int, haveright) ), @@ -1586,16 +1587,16 @@ TRACE_EVENT(xfs_free_extent, __entry->agno = agno; __entry->agbno = agbno; __entry->len = len; - __entry->isfl = isfl; + __entry->resv = resv; __entry->haveleft = haveleft; __entry->haveright = haveright; ), - TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s", + TP_printk("dev %d:%d agno %u agbno %u len %u resv %d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, __entry->len, - __entry->isfl, + __entry->resv, __entry->haveleft ? (__entry->haveright ? "both" : "left") : (__entry->haveright ? "right" : "none")) @@ -1622,8 +1623,8 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __field(short, otype) __field(char, wasdel) __field(char, wasfromfl) - __field(char, isfl) - __field(char, userdata) + __field(int, resv) + __field(int, datatype) __field(xfs_fsblock_t, firstblock) ), TP_fast_assign( @@ -1643,14 +1644,14 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __entry->otype = args->otype; __entry->wasdel = args->wasdel; __entry->wasfromfl = args->wasfromfl; - __entry->isfl = args->isfl; - __entry->userdata = args->userdata; + __entry->resv = args->resv; + __entry->datatype = args->datatype; __entry->firstblock = args->firstblock; ), TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " "prod %u minleft %u total %u alignment %u minalignslop %u " - "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d " - "userdata %d firstblock 0x%llx", + "len %u type %s otype %s wasdel %d wasfromfl %d resv %d " + "datatype 0x%x firstblock 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -1667,8 +1668,8 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __print_symbolic(__entry->otype, XFS_ALLOC_TYPES), __entry->wasdel, __entry->wasfromfl, - __entry->isfl, - __entry->userdata, + __entry->resv, + __entry->datatype, (unsigned long long)__entry->firstblock) ) @@ -1984,6 +1985,29 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \ DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); +TRACE_EVENT(xfs_log_recover_record, + TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass), + TP_ARGS(log, rhead, pass), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_lsn_t, lsn) + __field(int, len) + __field(int, num_logops) + __field(int, pass) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->lsn = be64_to_cpu(rhead->h_lsn); + __entry->len = be32_to_cpu(rhead->h_len); + __entry->num_logops = be32_to_cpu(rhead->h_num_logops); + __entry->pass = pass; + ), + TP_printk("dev %d:%d lsn 0x%llx len 0x%x num_logops 0x%x pass %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lsn, __entry->len, __entry->num_logops, + __entry->pass) +) + 
DECLARE_EVENT_CLASS(xfs_log_recover_item_class, TP_PROTO(struct xlog *log, struct xlog_recover *trans, struct xlog_recover_item *item, int pass), @@ -1992,6 +2016,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class, __field(dev_t, dev) __field(unsigned long, item) __field(xlog_tid_t, tid) + __field(xfs_lsn_t, lsn) __field(int, type) __field(int, pass) __field(int, count) @@ -2001,15 +2026,17 @@ DECLARE_EVENT_CLASS(xfs_log_recover_item_class, __entry->dev = log->l_mp->m_super->s_dev; __entry->item = (unsigned long)item; __entry->tid = trans->r_log_tid; + __entry->lsn = trans->r_lsn; __entry->type = ITEM_TYPE(item); __entry->pass = pass; __entry->count = item->ri_cnt; __entry->total = item->ri_total; ), - TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s " - "item region count/total %d/%d", + TP_printk("dev %d:%d tid 0x%x lsn 0x%llx, pass %d, item 0x%p, " + "item type %s item region count/total %d/%d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, + __entry->lsn, __entry->pass, (void *)__entry->item, __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), @@ -2068,6 +2095,7 @@ DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_skip); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf); DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf); @@ -2558,6 +2586,60 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result); DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result); DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result); +/* per-AG reservation */ +DECLARE_EVENT_CLASS(xfs_ag_resv_class, + TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type resv, + xfs_extlen_t len), + TP_ARGS(pag, resv, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, resv) + __field(xfs_extlen_t, freeblks) + __field(xfs_extlen_t, flcount) + __field(xfs_extlen_t, reserved) + __field(xfs_extlen_t, asked) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + struct xfs_ag_resv *r = xfs_perag_resv(pag, resv); + + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->resv = resv; + __entry->freeblks = pag->pagf_freeblks; + __entry->flcount = pag->pagf_flcount; + __entry->reserved = r ? r->ar_reserved : 0; + __entry->asked = r ? 
r->ar_asked : 0; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u resv %u ask %u len %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->resv, + __entry->freeblks, + __entry->flcount, + __entry->reserved, + __entry->asked, + __entry->len) +) +#define DEFINE_AG_RESV_EVENT(name) \ +DEFINE_EVENT(xfs_ag_resv_class, name, \ + TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type type, \ + xfs_extlen_t len), \ + TP_ARGS(pag, type, len)) + +/* per-AG reservation tracepoints */ +DEFINE_AG_RESV_EVENT(xfs_ag_resv_init); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_free); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_alloc_extent); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed); + +DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error); +DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 5f3d33d..70f42ea 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -217,7 +217,7 @@ undo_log: undo_blocks: if (blocks > 0) { - xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); + xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd); tp->t_blk_res = 0; } @@ -318,7 +318,6 @@ xfs_trans_mod_sb( * in-core superblock's counter. This should only * be applied to the on-disk superblock. */ - ASSERT(delta < 0); tp->t_res_fdblocks_delta += delta; if (xfs_sb_version_haslazysbcount(&mp->m_sb)) flags &= ~XFS_TRANS_SB_DIRTY; diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index 459ddec..ab43864 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -79,7 +79,8 @@ xfs_trans_free_extent( trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); - error = xfs_free_extent(tp, start_block, ext_len, oinfo); + error = xfs_free_extent(tp, start_block, ext_len, oinfo, + XFS_AG_RESV_NONE); /* * Mark the transaction dirty, even on error. 
This ensures the diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index ea62245..6290093 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -147,6 +147,7 @@ __xfs_xattr_put_listent( arraytop = context->count + prefix_len + namelen + 1; if (arraytop > context->firstu) { context->count = -1; /* insufficient space */ + context->seen_enough = 1; return 0; } offset = (char *)context->alist + context->count; diff --git a/include/linux/dax.h b/include/linux/dax.h index 9c6dc77..add6c4b 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -6,13 +6,19 @@ #include <linux/radix-tree.h> #include <asm/pgtable.h> +struct iomap_ops; + /* We use lowest available exceptional entry bit for locking */ #define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) +ssize_t iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, + struct iomap_ops *ops); ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, get_block_t, dio_iodone_t, int flags); int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); +int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + struct iomap_ops *ops); int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); void dax_wake_mapping_entry_waiter(struct address_space *mapping, diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 3d70ece..e63e288 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -22,6 +22,8 @@ struct vm_fault; * Flags for iomap mappings: */ #define IOMAP_F_MERGED 0x01 /* contains multiple blocks/extents */ +#define IOMAP_F_SHARED 0x02 /* block shared with another file */ +#define IOMAP_F_NEW 0x04 /* blocks have been newly allocated */ /* * Magic value for blkno: @@ -64,6 +66,8 @@ struct iomap_ops { ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, struct iomap_ops *ops); +int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, + struct iomap_ops *ops); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, struct iomap_ops *ops); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, diff --git a/mm/filemap.c b/mm/filemap.c index 4bad32d..68f1813 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1923,16 +1923,18 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (iocb->ki_flags & IOCB_DIRECT) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; + struct iov_iter data = *iter; loff_t size; size = i_size_read(inode); retval = filemap_write_and_wait_range(mapping, iocb->ki_pos, iocb->ki_pos + count - 1); - if (!retval) { - struct iov_iter data = *iter; - retval = mapping->a_ops->direct_IO(iocb, &data); - } + if (retval < 0) + goto out; + file_accessed(file); + + retval = mapping->a_ops->direct_IO(iocb, &data); if (retval > 0) { iocb->ki_pos += retval; iov_iter_advance(iter, retval); @@ -1948,10 +1950,8 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) * DAX files, so don't bother trying. */ if (retval < 0 || !iov_iter_count(iter) || iocb->ki_pos >= size || - IS_DAX(inode)) { - file_accessed(file); + IS_DAX(inode)) goto out; - } } retval = do_generic_file_read(file, &iocb->ki_pos, iter, retval); |
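
The xfs_rmap_item changes above replace the open-coded "sizeof(struct) + (nextents - 1) * sizeof(extent)" arithmetic with helpers such as xfs_rui_log_item_sizeof(), which build the size from offsetof() plus the format payload for the requested extent count. The following is a self-contained sketch of that offsetof-based sizing idiom; the struct layouts (map_extent, rui_log_format) are simplified stand-ins for illustration, not the on-disk XFS definitions:

	#include <stddef.h>
	#include <stdio.h>

	/* Illustrative stand-ins for the variable-length log format. */
	struct map_extent {
		unsigned long long	me_owner;
		unsigned long long	me_startblock;
		unsigned long long	me_startoff;
		unsigned int		me_len;
		unsigned int		me_flags;
	};

	struct rui_log_format {
		unsigned short		rui_type;
		unsigned short		rui_size;
		unsigned int		rui_nextents;
		unsigned long long	rui_id;
		struct map_extent	rui_extents[];	/* variable length */
	};

	/*
	 * Size of a format structure carrying 'nr' extents: offsetof()
	 * measures everything before the trailing array, so the result is
	 * the same whether the array is declared with one element or as a
	 * flexible array member.
	 */
	static inline size_t rui_log_format_sizeof(unsigned int nr)
	{
		return offsetof(struct rui_log_format, rui_extents) +
		       nr * sizeof(struct map_extent);
	}

	int main(void)
	{
		printf("1 extent:   %zu bytes\n", rui_log_format_sizeof(1));
		printf("16 extents: %zu bytes\n", rui_log_format_sizeof(16));
		return 0;
	}

Basing the calculation on offsetof() also removes the easy-to-get-wrong "- 1" term that the deleted xfs_rui_item_sizeof() carried.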
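
The xfs_mount.h and xfs_sysfs.c hunks widen retry_timeout to a signed long so that -1 (XFS_ERR_RETRY_FOREVER) can mean "retry forever", alongside max_retries = -1 for unlimited retries and 0 for immediate failure. Below is a minimal user-space sketch of that sentinel convention; the names (error_cfg, should_keep_retrying) and the plain seconds-based bookkeeping are assumptions for illustration, not the kernel's internal retry logic:

	#include <stdbool.h>
	#include <stdio.h>

	#define ERR_RETRY_FOREVER	(-1L)	/* mirrors XFS_ERR_RETRY_FOREVER */

	/* Hypothetical stand-in for struct xfs_error_cfg: both limits are
	 * signed so that -1 can act as the "never give up" sentinel. */
	struct error_cfg {
		int	max_retries;		/* -1 = unlimited retries */
		long	retry_timeout_secs;	/* -1 = no timeout, 0 = fail at once */
	};

	/* Another retry is allowed only while neither limit has expired. */
	static bool should_keep_retrying(const struct error_cfg *cfg,
					 int retries_done, long elapsed_secs)
	{
		if (cfg->max_retries != ERR_RETRY_FOREVER &&
		    retries_done >= cfg->max_retries)
			return false;	/* retry budget exhausted */

		if (cfg->retry_timeout_secs != ERR_RETRY_FOREVER &&
		    elapsed_secs >= cfg->retry_timeout_secs)
			return false;	/* timeout expired */

		return true;
	}

	int main(void)
	{
		struct error_cfg enospc = { .max_retries = ERR_RETRY_FOREVER,
					    .retry_timeout_secs = ERR_RETRY_FOREVER };
		struct error_cfg enodev = { .max_retries = 0,
					    .retry_timeout_secs = 0 };

		printf("enospc keeps retrying: %d\n",
		       should_keep_retrying(&enospc, 1000, 86400));
		printf("enodev keeps retrying: %d\n",
		       should_keep_retrying(&enodev, 0, 0));
		return 0;
	}

Under this convention the defaults in xfs_error_meta_init read naturally: EIO and ENOSPC keep -1/-1 (retry forever), while ENODEV uses 0/0 so the error is propagated immediately.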