From 25b68a8f0ab13a98de02650208ec927796659898 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Fri, 17 Feb 2017 10:13:59 -0500 Subject: timerfd: Only check CAP_WAKE_ALARM when it is needed timerfd_create() and do_timerfd_settime() evaluate capable(CAP_WAKE_ALARM) unconditionally although CAP_WAKE_ALARM is only required for CLOCK_REALTIME_ALARM and CLOCK_BOOTTIME_ALARM. This can cause extraneous audit messages when using a LSM such as SELinux, incorrectly causes PF_SUPERPRIV to be set even when no privilege was exercised, and is inefficient. Flip the order of the tests in both functions so that we only call capable() if the capability is truly required for the operation. Signed-off-by: Stephen Smalley Cc: linux-security-module@vger.kernel.org Cc: selinux@tycho.nsa.gov Link: http://lkml.kernel.org/r/1487344439-22293-1-git-send-email-sds@tycho.nsa.gov Signed-off-by: Thomas Gleixner --- fs/timerfd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/timerfd.c b/fs/timerfd.c index 384fa75..c543cdb 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -400,9 +400,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) clockid != CLOCK_BOOTTIME_ALARM)) return -EINVAL; - if (!capable(CAP_WAKE_ALARM) && - (clockid == CLOCK_REALTIME_ALARM || - clockid == CLOCK_BOOTTIME_ALARM)) + if ((clockid == CLOCK_REALTIME_ALARM || + clockid == CLOCK_BOOTTIME_ALARM) && + !capable(CAP_WAKE_ALARM)) return -EPERM; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); @@ -449,7 +449,7 @@ static int do_timerfd_settime(int ufd, int flags, return ret; ctx = f.file->private_data; - if (!capable(CAP_WAKE_ALARM) && isalarm(ctx)) { + if (isalarm(ctx) && !capable(CAP_WAKE_ALARM)) { fdput(f); return -EPERM; } -- cgit v1.1 From c771c14baa3319c85512e575671bf0bb18824c11 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Thu, 2 Mar 2017 15:02:06 -0800 Subject: iomap: invalidate page caches should be after iomap_dio_complete() in direct write After XFS switching to iomap based DIO (commit acdda3aae146 ("xfs: use iomap_dio_rw")), I started to notice dio29/dio30 tests failures from LTP run on ppc64 hosts, and they can be reproduced on x86_64 hosts with 512B/1k block size XFS too. dio29 diotest3 -b 65536 -n 100 -i 1000 -o 1024000 dio30 diotest6 -b 65536 -n 100 -i 1000 -o 1024000 The failure message is like: bufcmp: offset 0: Expected: 0x62, got 0x0 diotest03 1 TPASS : Read with Direct IO, Write without diotest03 2 TFAIL : diotest3.c:142: comparsion failed; child=98 offset=1425408 diotest03 3 TFAIL : diotest3.c:194: Write Direct-child 98 failed Direct write wrote 0x62 but buffer read got zero. This is because, when doing direct write to a hole or preallocated file, we invalidate the page caches before converting the extent from unwritten state to normal state, which is done by iomap_dio_complete(), thus leave a window for other buffer reader to cache the unwritten state extent. Consider this case, with sub-page blocksize XFS, two processes are direct writing to different blocksize-aligned regions (say 512B) of the same preallocated file, and reading the region back via buffered I/O to compare contents. process A, region [0,512] process B, region [512,1024] xfs_file_write_iter xfs_file_aio_dio_write iomap_dio_rw iomap_apply invalidate_inode_pages2_range xfs_file_write_iter xfs_file_aio_dio_write iomap_dio_rw iomap_apply invalidate_inode_pages2_range iomap_dio_complete xfs_file_read_iter xfs_file_buffered_aio_read generic_file_read_iter do_generic_file_read iomap_dio_complete xfs_file_read_iter Process A first invalidates page caches, at this point the underlying extent is still in unwritten state (iomap_dio_complete not called yet), and process B finishs direct write and populates page caches via readahead, which caches zeros in page for region A, then process A reads zeros from page cache, instead of the actual data. Fix it by invalidating page caches after converting unwritten extent to make sure we read content from disk after extent state changed, as what we did before switching to iomap based dio. Also introduce a new 'start' variable to save the original write offset (iomap_dio_complete() updates iocb->ki_pos), and a 'err' variable for invalidating caches result, cause we can't reuse 'ret' anymore. Signed-off-by: Eryu Guan Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/iomap.c b/fs/iomap.c index 3ca1a8e..141c3cd 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -846,7 +846,8 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); size_t count = iov_iter_count(iter); - loff_t pos = iocb->ki_pos, end = iocb->ki_pos + count - 1, ret = 0; + loff_t pos = iocb->ki_pos, start = pos; + loff_t end = iocb->ki_pos + count - 1, ret = 0; unsigned int flags = IOMAP_DIRECT; struct blk_plug plug; struct iomap_dio *dio; @@ -887,12 +888,12 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, } if (mapping->nrpages) { - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); + ret = filemap_write_and_wait_range(mapping, start, end); if (ret) goto out_free_dio; ret = invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); + start >> PAGE_SHIFT, end >> PAGE_SHIFT); WARN_ON_ONCE(ret); ret = 0; } @@ -941,6 +942,8 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, __set_current_state(TASK_RUNNING); } + ret = iomap_dio_complete(dio); + /* * Try again to invalidate clean pages which might have been cached by * non-direct readahead, or faulted in by get_user_pages() if the source @@ -949,12 +952,12 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, * this invalidation fails, tough, the write still worked... */ if (iov_iter_rw(iter) == WRITE && mapping->nrpages) { - ret = invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); + int err = invalidate_inode_pages2_range(mapping, + start >> PAGE_SHIFT, end >> PAGE_SHIFT); + WARN_ON_ONCE(err); } - return iomap_dio_complete(dio); + return ret; out_free_dio: kfree(dio); -- cgit v1.1 From 3802a345321a08093ba2ddb1849e736f84e8d450 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Mar 2017 16:45:58 -0800 Subject: xfs: only reclaim unwritten COW extents periodically We only want to reclaim preallocations from our periodic work item. Currently this is archived by looking for a dirty inode, but that check is rather fragile. Instead add a flag to xfs_reflink_cancel_cow_* so that the caller can ask for just cancelling unwritten extents in the COW fork. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong [darrick: fix typos in commit message] Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 2 +- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_reflink.c | 23 ++++++++++++++++------- fs/xfs/xfs_reflink.h | 4 ++-- fs/xfs/xfs_super.c | 2 +- 6 files changed, 22 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index bf65a9e..aa8a6f0 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -293,7 +293,7 @@ xfs_end_io( goto done; if (ioend->io_bio->bi_error) { error = xfs_reflink_cancel_cow_range(ip, - ioend->io_offset, ioend->io_size); + ioend->io_offset, ioend->io_size, true); goto done; } error = xfs_reflink_end_cow(ip, ioend->io_offset, diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 7234b97..3531f8f 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1608,7 +1608,7 @@ xfs_inode_free_cowblocks( xfs_ilock(ip, XFS_IOLOCK_EXCL); xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); + ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); xfs_iunlock(ip, XFS_IOLOCK_EXCL); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index edfa6a5..7eaf1ef 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1615,7 +1615,7 @@ xfs_itruncate_extents( /* Remove all pending CoW reservations. */ error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block, - last_block); + last_block, true); if (error) goto out; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index da6d08f..4a84c5e 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -548,14 +548,18 @@ xfs_reflink_trim_irec_to_next_cow( } /* - * Cancel all pending CoW reservations for some block range of an inode. + * Cancel CoW reservations for some block range of an inode. + * + * If cancel_real is true this function cancels all COW fork extents for the + * inode; if cancel_real is false, real extents are not cleared. */ int xfs_reflink_cancel_cow_blocks( struct xfs_inode *ip, struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, - xfs_fileoff_t end_fsb) + xfs_fileoff_t end_fsb, + bool cancel_real) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); struct xfs_bmbt_irec got, del; @@ -579,7 +583,7 @@ xfs_reflink_cancel_cow_blocks( &idx, &got, &del); if (error) break; - } else { + } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { xfs_trans_ijoin(*tpp, ip, 0); xfs_defer_init(&dfops, &firstfsb); @@ -621,13 +625,17 @@ xfs_reflink_cancel_cow_blocks( } /* - * Cancel all pending CoW reservations for some byte range of an inode. + * Cancel CoW reservations for some byte range of an inode. + * + * If cancel_real is true this function cancels all COW fork extents for the + * inode; if cancel_real is false, real extents are not cleared. */ int xfs_reflink_cancel_cow_range( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t count) + xfs_off_t count, + bool cancel_real) { struct xfs_trans *tp; xfs_fileoff_t offset_fsb; @@ -653,7 +661,8 @@ xfs_reflink_cancel_cow_range( xfs_trans_ijoin(tp, ip, 0); /* Scrape out the old CoW reservations */ - error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb); + error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb, + cancel_real); if (error) goto out_cancel; @@ -1450,7 +1459,7 @@ next: * We didn't find any shared blocks so turn off the reflink flag. * First, get rid of any leftover CoW mappings. */ - error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF); + error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true); if (error) return error; diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 33ac9b8..d29a796 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -39,9 +39,9 @@ extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, - xfs_fileoff_t end_fsb); + xfs_fileoff_t end_fsb, bool cancel_real); extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t count); + xfs_off_t count, bool cancel_real); extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern int xfs_reflink_recover_cow(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 890862f..685c042 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -953,7 +953,7 @@ xfs_fs_destroy_inode( XFS_STATS_INC(ip->i_mount, vn_remove); if (xfs_is_reflink_inode(ip)) { - error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF); + error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) xfs_warn(ip->i_mount, "Error %d while evicting CoW blocks for inode %llu.", -- cgit v1.1 From 787eb485509f9d58962bd8b4dbc6a5ac6e2034fe Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Mar 2017 15:02:51 -0800 Subject: xfs: fix and streamline error handling in xfs_end_io There are two different cases of buffered I/O errors: - first we can have an already shutdown fs. In that case we should skip any on-disk operations and just clean up the appen transaction if present and destroy the ioend - a real I/O error. In that case we should cleanup any lingering COW blocks. This gets skipped in the current code and is fixed by this patch. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 59 +++++++++++++++++++++++++------------------------------ 1 file changed, 27 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index aa8a6f0..6149429 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -274,54 +274,49 @@ xfs_end_io( struct xfs_ioend *ioend = container_of(work, struct xfs_ioend, io_work); struct xfs_inode *ip = XFS_I(ioend->io_inode); + xfs_off_t offset = ioend->io_offset; + size_t size = ioend->io_size; int error = ioend->io_bio->bi_error; /* - * Set an error if the mount has shut down and proceed with end I/O - * processing so it can perform whatever cleanups are necessary. + * Just clean up the in-memory strutures if the fs has been shut down. */ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { error = -EIO; + goto done; + } /* - * For a CoW extent, we need to move the mapping from the CoW fork - * to the data fork. If instead an error happened, just dump the - * new blocks. + * Clean up any COW blocks on an I/O error. */ - if (ioend->io_type == XFS_IO_COW) { - if (error) - goto done; - if (ioend->io_bio->bi_error) { - error = xfs_reflink_cancel_cow_range(ip, - ioend->io_offset, ioend->io_size, true); - goto done; + if (unlikely(error)) { + switch (ioend->io_type) { + case XFS_IO_COW: + xfs_reflink_cancel_cow_range(ip, offset, size, true); + break; } - error = xfs_reflink_end_cow(ip, ioend->io_offset, - ioend->io_size); - if (error) - goto done; + + goto done; } /* - * For unwritten extents we need to issue transactions to convert a - * range to normal written extens after the data I/O has finished. - * Detecting and handling completion IO errors is done individually - * for each case as different cleanup operations need to be performed - * on error. + * Success: commit the COW or unwritten blocks if needed. */ - if (ioend->io_type == XFS_IO_UNWRITTEN) { - if (error) - goto done; - error = xfs_iomap_write_unwritten(ip, ioend->io_offset, - ioend->io_size); - } else if (ioend->io_append_trans) { - error = xfs_setfilesize_ioend(ioend, error); - } else { - ASSERT(!xfs_ioend_is_append(ioend) || - ioend->io_type == XFS_IO_COW); + switch (ioend->io_type) { + case XFS_IO_COW: + error = xfs_reflink_end_cow(ip, offset, size); + break; + case XFS_IO_UNWRITTEN: + error = xfs_iomap_write_unwritten(ip, offset, size); + break; + default: + ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans); + break; } done: + if (ioend->io_append_trans) + error = xfs_setfilesize_ioend(ioend, error); xfs_destroy_ioend(ioend, error); } -- cgit v1.1 From d5825712ee98d68a2c17bc89dad2c30276894cba Mon Sep 17 00:00:00 2001 From: Chandan Rajendra Date: Thu, 2 Mar 2017 15:06:33 -0800 Subject: xfs: Use xfs_icluster_size_fsb() to calculate inode alignment mask When block size is larger than inode cluster size, the call to XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size) returns 0. Also, mkfs.xfs would have set xfs_sb->sb_inoalignmt to 0. Hence in xfs_set_inoalignment(), xfs_mount->m_inoalign_mask gets initialized to -1 instead of 0. However, xfs_mount->m_sinoalign would get correctly intialized to 0 because for every positive value of xfs_mount->m_dalign, the condition "!(mp->m_dalign & mp->m_inoalign_mask)" would evaluate to false. Also, xfs_imap() worked fine even with xfs_mount->m_inoalign_mask having -1 as the value because blks_per_cluster variable would have the value 1 and hence we would never have a need to use xfs_mount->m_inoalign_mask to compute the inode chunk's agbno and offset within the chunk. Signed-off-by: Chandan Rajendra Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_mount.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 450bde6..688ebff 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -513,8 +513,7 @@ STATIC void xfs_set_inoalignment(xfs_mount_t *mp) { if (xfs_sb_version_hasalign(&mp->m_sb) && - mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) + mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp)) mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1; else mp->m_inoalign_mask = 0; -- cgit v1.1 From 08b005f1333154ae5b404ca28766e0ffb9f1c150 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 6 Mar 2017 11:58:20 -0800 Subject: xfs: remove kmem_zalloc_greedy The sole remaining caller of kmem_zalloc_greedy is bulkstat, which uses it to grab 1-4 pages for staging of inobt records. The infinite loop in the greedy allocation function is causing hangs[1] in generic/269, so just get rid of the greedy allocator in favor of kmem_zalloc_large. This makes bulkstat somewhat more likely to ENOMEM if there's really no pages to spare, but eliminates a source of hangs. [1] http://lkml.kernel.org/r/20170301044634.rgidgdqqiiwsmfpj%40XZHOUW.usersys.redhat.com Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- v2: remove single-page fallback --- fs/xfs/kmem.c | 18 ------------------ fs/xfs/kmem.h | 2 -- fs/xfs/xfs_itable.c | 6 ++---- 3 files changed, 2 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 2dfdc62..70a5b55 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -25,24 +25,6 @@ #include "kmem.h" #include "xfs_message.h" -/* - * Greedy allocation. May fail and may return vmalloced memory. - */ -void * -kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) -{ - void *ptr; - size_t kmsize = maxsize; - - while (!(ptr = vzalloc(kmsize))) { - if ((kmsize >>= 1) <= minsize) - kmsize = minsize; - } - if (ptr) - *size = kmsize; - return ptr; -} - void * kmem_alloc(size_t size, xfs_km_flags_t flags) { diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 689f746..f0fc84f 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -69,8 +69,6 @@ static inline void kmem_free(const void *ptr) } -extern void *kmem_zalloc_greedy(size_t *, size_t, size_t); - static inline void * kmem_zalloc(size_t size, xfs_km_flags_t flags) { diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 66e8817..2a6d9b1 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -361,7 +361,6 @@ xfs_bulkstat( xfs_agino_t agino; /* inode # in allocation group */ xfs_agnumber_t agno; /* allocation group number */ xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ - size_t irbsize; /* size of irec buffer in bytes */ xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ int nirbuf; /* size of irbuf */ int ubcount; /* size of user's buffer */ @@ -388,11 +387,10 @@ xfs_bulkstat( *ubcountp = 0; *done = 0; - irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); + irbuf = kmem_zalloc_large(PAGE_SIZE * 4, KM_SLEEP); if (!irbuf) return -ENOMEM; - - nirbuf = irbsize / sizeof(*irbuf); + nirbuf = (PAGE_SIZE * 4) / sizeof(*irbuf); /* * Loop over the allocation groups, starting from the last -- cgit v1.1 From f65e6fad293b3a5793b7fa2044800506490e7a2e Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Wed, 8 Mar 2017 09:58:08 -0800 Subject: xfs: use iomap new flag for newly allocated delalloc blocks Commit fa7f138 ("xfs: clear delalloc and cache on buffered write failure") fixed one regression in the iomap error handling code and exposed another. The fundamental problem is that if a buffered write is a rewrite of preexisting delalloc blocks and the write fails, the failure handling code can punch out preexisting blocks with valid file data. This was reproduced directly by sub-block writes in the LTP kernel/syscalls/write/write03 test. A first 100 byte write allocates a single block in a file. A subsequent 100 byte write fails and punches out the block, including the data successfully written by the previous write. To address this problem, update the ->iomap_begin() handler to distinguish newly allocated delalloc blocks from preexisting delalloc blocks via the IOMAP_F_NEW flag. Use this flag in the ->iomap_end() handler to decide when a failed or short write should punch out delalloc blocks. This introduces the subtle requirement that ->iomap_begin() should never combine newly allocated delalloc blocks with existing blocks in the resulting iomap descriptor. This can occur when a new delalloc reservation merges with a neighboring extent that is part of the current write, for example. Therefore, drop the post-allocation extent lookup from xfs_bmapi_reserve_delalloc() and just return the record inserted into the fork. This ensures only new blocks are returned and thus that preexisting delalloc blocks are always handled as "found" blocks and not punched out on a failed rewrite. Reported-by: Xiong Zhou Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 24 ++++++++++++++---------- fs/xfs/xfs_iomap.c | 25 ++++++++++++++++++------- 2 files changed, 32 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index a9c66d4..bfa59a1 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4150,6 +4150,19 @@ xfs_bmapi_read( return 0; } +/* + * Add a delayed allocation extent to an inode. Blocks are reserved from the + * global pool and the extent inserted into the inode in-core extent tree. + * + * On entry, got refers to the first extent beyond the offset of the extent to + * allocate or eof is specified if no such extent exists. On return, got refers + * to the extent record that was inserted to the inode fork. + * + * Note that the allocated extent may have been merged with contiguous extents + * during insertion into the inode fork. Thus, got does not reflect the current + * state of the inode fork on return. If necessary, the caller can use lastx to + * look up the updated record in the inode fork. + */ int xfs_bmapi_reserve_delalloc( struct xfs_inode *ip, @@ -4236,13 +4249,8 @@ xfs_bmapi_reserve_delalloc( got->br_startblock = nullstartblock(indlen); got->br_blockcount = alen; got->br_state = XFS_EXT_NORM; - xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got); - /* - * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay - * might have merged it into one of the neighbouring ones. - */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); + xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got); /* * Tag the inode if blocks were preallocated. Note that COW fork @@ -4254,10 +4262,6 @@ xfs_bmapi_reserve_delalloc( if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) xfs_inode_set_cowblocks_tag(ip); - ASSERT(got->br_startoff <= aoff); - ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen); - ASSERT(isnullstartblock(got->br_startblock)); - ASSERT(got->br_state == XFS_EXT_NORM); return 0; out_unreserve_blocks: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 41662fb..288ee5b 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -630,6 +630,11 @@ retry: goto out_unlock; } + /* + * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch + * them out if the write happens to fail. + */ + iomap->flags = IOMAP_F_NEW; trace_xfs_iomap_alloc(ip, offset, count, 0, &got); done: if (isnullstartblock(got.br_startblock)) @@ -1071,16 +1076,22 @@ xfs_file_iomap_end_delalloc( struct xfs_inode *ip, loff_t offset, loff_t length, - ssize_t written) + ssize_t written, + struct iomap *iomap) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t start_fsb; xfs_fileoff_t end_fsb; int error = 0; - /* behave as if the write failed if drop writes is enabled */ - if (xfs_mp_drop_writes(mp)) + /* + * Behave as if the write failed if drop writes is enabled. Set the NEW + * flag to force delalloc cleanup. + */ + if (xfs_mp_drop_writes(mp)) { + iomap->flags |= IOMAP_F_NEW; written = 0; + } /* * start_fsb refers to the first unused block after a short write. If @@ -1094,14 +1105,14 @@ xfs_file_iomap_end_delalloc( end_fsb = XFS_B_TO_FSB(mp, offset + length); /* - * Trim back delalloc blocks if we didn't manage to write the whole - * range reserved. + * Trim delalloc blocks if they were allocated by this write and we + * didn't manage to write the whole range. * * We don't need to care about racing delalloc as we hold i_mutex * across the reserve/allocate/unreserve calls. If there are delalloc * blocks in the range, they are ours. */ - if (start_fsb < end_fsb) { + if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) { truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), XFS_FSB_TO_B(mp, end_fsb) - 1); @@ -1131,7 +1142,7 @@ xfs_file_iomap_end( { if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, - length, written); + length, written, iomap); return 0; } -- cgit v1.1 From 2fcc319d2467a5f5b78f35f79fd6e22741a31b1e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Mar 2017 10:38:53 -0800 Subject: xfs: try any AG when allocating the first btree block when reflinking When a reflink operation causes the bmap code to allocate a btree block we're currently doing single-AG allocations due to having ->firstblock set and then try any higher AG due a little reflink quirk we've put in when adding the reflink code. But given that we do not have a minleft reservation of any kind in this AG we can still not have any space in the same or higher AG even if the file system has enough free space. To fix this use a XFS_ALLOCTYPE_FIRST_AG allocation in this fall back path instead. [And yes, we need to redo this properly instead of piling hacks over hacks. I'm working on that, but it's not going to be a small series. In the meantime this fixes the customer reported issue] Also add a warning for failing allocations to make it easier to debug. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 10 +++++++--- fs/xfs/libxfs/xfs_bmap_btree.c | 6 +++--- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index bfa59a1..9bd104f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -763,8 +763,8 @@ xfs_bmap_extents_to_btree( args.type = XFS_ALLOCTYPE_START_BNO; args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); } else if (dfops->dop_low) { -try_another_ag: args.type = XFS_ALLOCTYPE_START_BNO; +try_another_ag: args.fsbno = *firstblock; } else { args.type = XFS_ALLOCTYPE_NEAR_BNO; @@ -790,13 +790,17 @@ try_another_ag: if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && args.fsbno == NULLFSBLOCK && args.type == XFS_ALLOCTYPE_NEAR_BNO) { - dfops->dop_low = true; + args.type = XFS_ALLOCTYPE_FIRST_AG; goto try_another_ag; } + if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { + xfs_iroot_realloc(ip, -1, whichfork); + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return -ENOSPC; + } /* * Allocation can't fail, the space was reserved. */ - ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(*firstblock == NULLFSBLOCK || args.agno >= XFS_FSB_TO_AGNO(mp, *firstblock)); *firstblock = cur->bc_private.b.firstblock = args.fsbno; diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index f93072b..fd55db4 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -447,8 +447,8 @@ xfs_bmbt_alloc_block( if (args.fsbno == NULLFSBLOCK) { args.fsbno = be64_to_cpu(start->l); -try_another_ag: args.type = XFS_ALLOCTYPE_START_BNO; +try_another_ag: /* * Make sure there is sufficient room left in the AG to * complete a full tree split for an extent insert. If @@ -488,8 +488,8 @@ try_another_ag: if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && args.fsbno == NULLFSBLOCK && args.type == XFS_ALLOCTYPE_NEAR_BNO) { - cur->bc_private.b.dfops->dop_low = true; args.fsbno = cur->bc_private.b.firstblock; + args.type = XFS_ALLOCTYPE_FIRST_AG; goto try_another_ag; } @@ -506,7 +506,7 @@ try_another_ag: goto error0; cur->bc_private.b.dfops->dop_low = true; } - if (args.fsbno == NULLFSBLOCK) { + if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; -- cgit v1.1 From 04bb94b13c02e9dbc92d622cddf88937127bd7ed Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Mar 2017 10:42:13 -0800 Subject: overlayfs: remove now unnecessary header file include This removes the extra include header file that was added in commit e58bc927835a "Pull overlayfs updates from Miklos Szeredi" now that it is no longer needed. There are probably other such includes that got added during the scheduler header splitup series, but this is the one that annoyed me personally and I know about. Signed-off-by: Linus Torvalds --- fs/overlayfs/util.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 1953986..6e610a2 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -12,7 +12,6 @@ #include #include #include -#include #include "overlayfs.h" #include "ovl_entry.h" -- cgit v1.1 From c2febafc67734a62196c1b9dfba926412d4077ba Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 9 Mar 2017 17:24:07 +0300 Subject: mm: convert generic code to 5-level paging Convert all non-architecture-specific code to 5-level paging. It's mostly mechanical adding handling one more page table level in places where we deal with pud_t. Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 973607d..02ce394 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -267,6 +267,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, { struct mm_struct *mm = ctx->mm; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd, _pmd; pte_t *pte; @@ -277,7 +278,10 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) goto out; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + goto out; + pud = pud_offset(p4d, address); if (!pud_present(*pud)) goto out; pmd = pmd_offset(pud, address); -- cgit v1.1 From 6bbc4a4144b1a69743022ac68dfaf6e7d993abb9 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 9 Mar 2017 16:16:28 -0800 Subject: userfaultfd: shmem: __do_fault requires VM_FAULT_NOPAGE __do_fault assumes vmf->page has been initialized and is valid if VM_FAULT_NOPAGE is not returned by vma->vm_ops->fault(vma, vmf). handle_userfault() in turn should return VM_FAULT_NOPAGE if it doesn't return VM_FAULT_SIGBUS or VM_FAULT_RETRY (the other two possibilities). This VM_FAULT_NOPAGE case is only invoked when signal are pending and it didn't matter for anonymous memory before. It only started to matter since shmem was introduced. hugetlbfs also takes a different path and doesn't exercise __do_fault. Link: http://lkml.kernel.org/r/20170228154201.GH5816@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: Dmitry Vyukov Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 973607d..f62199b 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -490,7 +490,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) * in such case. */ down_read(&mm->mmap_sem); - ret = 0; + ret = VM_FAULT_NOPAGE; } } -- cgit v1.1 From dd0db88d8094a6d9d4d1fc5fcd56ab619f54ccf8 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 9 Mar 2017 16:16:49 -0800 Subject: userfaultfd: non-cooperative: rollback userfaultfd_exit Patch series "userfaultfd non-cooperative further update for 4.11 merge window". Unfortunately I noticed one relevant bug in userfaultfd_exit while doing more testing. I've been doing testing before and this was also tested by kbuild bot and exercised by the selftest, but this bug never reproduced before. I dropped userfaultfd_exit as result. I dropped it because of implementation difficulty in receiving signals in __mmput and because I think -ENOSPC as result from the background UFFDIO_COPY should be enough already. Before I decided to remove userfaultfd_exit, I noticed userfaultfd_exit wasn't exercised by the selftest and when I tried to exercise it, after moving it to a more correct place in __mmput where it would make more sense and where the vma list is stable, it resulted in the event_wait_completion in D state. So then I added the second patch to be sure even if we call userfaultfd_event_wait_completion too late during task exit(), we won't risk to generate tasks in D state. The same check exists in handle_userfault() for the same reason, except it makes a difference there, while here is just a robustness check and it's run under WARN_ON_ONCE. While looking at the userfaultfd_event_wait_completion() function I looked back at its callers too while at it and I think it's not ok to stop executing dup_fctx on the fcs list because we relay on userfaultfd_event_wait_completion to execute userfaultfd_ctx_put(fctx->orig) which is paired against userfaultfd_ctx_get(fctx->orig) in dup_userfault just before list_add(fcs). This change only takes care of fctx->orig but this area also needs further review looking for similar problems in fctx->new. The only patch that is urgent is the first because it's an use after free during a SMP race condition that affects all processes if CONFIG_USERFAULTFD=y. Very hard to reproduce though and probably impossible without SLUB poisoning enabled. This patch (of 3): I once reproduced this oops with the userfaultfd selftest, it's not easily reproducible and it requires SLUB poisoning to reproduce. general protection fault: 0000 [#1] SMP Modules linked in: CPU: 2 PID: 18421 Comm: userfaultfd Tainted: G ------------ T 3.10.0+ #15 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.1-0-g8891697-prebuilt.qemu-project.org 04/01/2014 task: ffff8801f83b9440 ti: ffff8801f833c000 task.ti: ffff8801f833c000 RIP: 0010:[] [] userfaultfd_exit+0x29/0xa0 RSP: 0018:ffff8801f833fe80 EFLAGS: 00010202 RAX: ffff8801f833ffd8 RBX: 6b6b6b6b6b6b6b6b RCX: ffff8801f83b9440 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8800baf18600 RBP: ffff8801f833fee8 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000000 R11: ffffffff8127ceb3 R12: 0000000000000000 R13: ffff8800baf186b0 R14: ffff8801f83b99f8 R15: 00007faed746c700 FS: 0000000000000000(0000) GS:ffff88023fc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007faf0966f028 CR3: 0000000001bc6000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Call Trace: do_exit+0x297/0xd10 SyS_exit+0x17/0x20 tracesys+0xdd/0xe2 Code: 00 00 66 66 66 66 90 55 48 89 e5 41 54 53 48 83 ec 58 48 8b 1f 48 85 db 75 11 eb 73 66 0f 1f 44 00 00 48 8b 5b 10 48 85 db 74 64 <4c> 8b a3 b8 00 00 00 4d 85 e4 74 eb 41 f6 84 24 2c 01 00 00 80 RIP [] userfaultfd_exit+0x29/0xa0 RSP ---[ end trace 9fecd6dcb442846a ]--- In the debugger I located the "mm" pointer in the stack and walking mm->mmap->vm_next through the end shows the vma->vm_next list is fully consistent and it is null terminated list as expected. So this has to be an SMP race condition where userfaultfd_exit was running while the vma list was being modified by another CPU. When userfaultfd_exit() run one of the ->vm_next pointers pointed to SLAB_POISON (RBX is the vma pointer and is 0x6b6b..). The reason is that it's not running in __mmput but while there are still other threads running and it's not holding the mmap_sem (it can't as it has to wait the even to be received by the manager). So this is an use after free that was happening for all processes. One more implementation problem aside from the race condition: userfaultfd_exit has really to check a flag in mm->flags before walking the vma or it's going to slowdown the exit() path for regular tasks. One more implementation problem: at that point signals can't be delivered so it would also create a task in D state if the manager doesn't read the event. The major design issue: it overall looks superfluous as the manager can check for -ENOSPC in the background transfer: if (mmget_not_zero(ctx->mm)) { [..] } else { return -ENOSPC; } It's safer to roll it back and re-introduce it later if at all. [rppt@linux.vnet.ibm.com: documentation fixup after removal of UFFD_EVENT_EXIT] Link: http://lkml.kernel.org/r/1488345437-4364-1-git-send-email-rppt@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/20170224181957.19736-2-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Signed-off-by: Mike Rapoport Acked-by: Mike Rapoport Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 28 ---------------------------- 1 file changed, 28 deletions(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f62199b..16d0cc6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -775,34 +775,6 @@ void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf) } } -void userfaultfd_exit(struct mm_struct *mm) -{ - struct vm_area_struct *vma = mm->mmap; - - /* - * We can do the vma walk without locking because the caller - * (exit_mm) knows it now has exclusive access - */ - while (vma) { - struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; - - if (ctx && (ctx->features & UFFD_FEATURE_EVENT_EXIT)) { - struct userfaultfd_wait_queue ewq; - - userfaultfd_ctx_get(ctx); - - msg_init(&ewq.msg); - ewq.msg.event = UFFD_EVENT_EXIT; - - userfaultfd_event_wait_completion(ctx, &ewq); - - ctx->features &= ~UFFD_FEATURE_EVENT_EXIT; - } - - vma = vma->vm_next; - } -} - static int userfaultfd_release(struct inode *inode, struct file *file) { struct userfaultfd_ctx *ctx = file->private_data; -- cgit v1.1 From 9a69a829f9b656c2a220d65a94ecf7b5887c5da1 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 9 Mar 2017 16:16:52 -0800 Subject: userfaultfd: non-cooperative: robustness check Similar to the handle_userfault() case, also make sure to never attempt to send any event past the PF_EXITING point of no return. This is purely a robustness check. Link: http://lkml.kernel.org/r/20170224181957.19736-3-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Acked-by: Mike Rapoport Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 16d0cc6..668bbbd 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -530,8 +530,13 @@ out: static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, struct userfaultfd_wait_queue *ewq) { - int ret = 0; + int ret; + + ret = -1; + if (WARN_ON_ONCE(current->flags & PF_EXITING)) + goto out; + ret = 0; ewq->ctx = ctx; init_waitqueue_entry(&ewq->wq, current); @@ -566,7 +571,7 @@ static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, * ctx may go away after this if the userfault pseudo fd is * already released. */ - +out: userfaultfd_ctx_put(ctx); return ret; } -- cgit v1.1 From 8c9e7bb7a41f2bbd54b2caefb274fb3de239819f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 9 Mar 2017 16:16:54 -0800 Subject: userfaultfd: non-cooperative: release all ctx in dup_userfaultfd_complete Don't stop running dup_fctx() even if userfaultfd_event_wait_completion fails as it has to run userfaultfd_ctx_put on all ctx to pair against the userfaultfd_ctx_get that was run on all fctx->orig in dup_userfaultfd. Link: http://lkml.kernel.org/r/20170224181957.19736-4-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Acked-by: Mike Rapoport Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 668bbbd..dd48052 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -527,16 +527,12 @@ out: return ret; } -static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, - struct userfaultfd_wait_queue *ewq) +static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, + struct userfaultfd_wait_queue *ewq) { - int ret; - - ret = -1; if (WARN_ON_ONCE(current->flags & PF_EXITING)) goto out; - ret = 0; ewq->ctx = ctx; init_waitqueue_entry(&ewq->wq, current); @@ -552,7 +548,6 @@ static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, break; if (ACCESS_ONCE(ctx->released) || fatal_signal_pending(current)) { - ret = -1; __remove_wait_queue(&ctx->event_wqh, &ewq->wq); break; } @@ -573,7 +568,6 @@ static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, */ out: userfaultfd_ctx_put(ctx); - return ret; } static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx, @@ -631,7 +625,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) return 0; } -static int dup_fctx(struct userfaultfd_fork_ctx *fctx) +static void dup_fctx(struct userfaultfd_fork_ctx *fctx) { struct userfaultfd_ctx *ctx = fctx->orig; struct userfaultfd_wait_queue ewq; @@ -641,17 +635,15 @@ static int dup_fctx(struct userfaultfd_fork_ctx *fctx) ewq.msg.event = UFFD_EVENT_FORK; ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new; - return userfaultfd_event_wait_completion(ctx, &ewq); + userfaultfd_event_wait_completion(ctx, &ewq); } void dup_userfaultfd_complete(struct list_head *fcs) { - int ret = 0; struct userfaultfd_fork_ctx *fctx, *n; list_for_each_entry_safe(fctx, n, fcs, list) { - if (!ret) - ret = dup_fctx(fctx); + dup_fctx(fctx); list_del(&fctx->list); kfree(fctx); } -- cgit v1.1 From 7eb76d457fd758d396bc2e65cb0ace5aae614149 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 9 Mar 2017 16:17:09 -0800 Subject: userfaultfd: non-cooperative: fix fork fctx->new memleak We have a memleak in the ->new ctx if the uffd of the parent is closed before the fork event is read, nothing frees the new context. Link: http://lkml.kernel.org/r/20170302173738.18994-2-aarcange@redhat.com Signed-off-by: Mike Rapoport Signed-off-by: Andrea Arcangeli Reported-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index dd48052..2407249 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -549,6 +549,15 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, if (ACCESS_ONCE(ctx->released) || fatal_signal_pending(current)) { __remove_wait_queue(&ctx->event_wqh, &ewq->wq); + if (ewq->msg.event == UFFD_EVENT_FORK) { + struct userfaultfd_ctx *new; + + new = (struct userfaultfd_ctx *) + (unsigned long) + ewq->msg.arg.reserved.reserved1; + + userfaultfd_ctx_put(new); + } break; } -- cgit v1.1 From 70ccb92fdd90b35bb6f9200093d4ffd6cb38156b Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 9 Mar 2017 16:17:11 -0800 Subject: userfaultfd: non-cooperative: userfaultfd_remove revalidate vma in MADV_DONTNEED userfaultfd_remove() has to be execute before zapping the pagetables or UFFDIO_COPY could keep filling pages after zap_page_range returned, which would result in non zero data after a MADV_DONTNEED. However userfaultfd_remove() may have to release the mmap_sem. This was handled correctly in MADV_REMOVE, but MADV_DONTNEED accessed a potentially stale vma (the very vma passed to zap_page_range(vma, ...)). The fix consists in revalidating the vma in case userfaultfd_remove() had to release the mmap_sem. This also optimizes away an unnecessary down_read/up_read in the MADV_REMOVE case if UFFD_EVENT_FORK had to be delivered. It all remains zero runtime cost in case CONFIG_USERFAULTFD=n as userfaultfd_remove() will be defined as "true" at build time. Link: http://lkml.kernel.org/r/20170302173738.18994-3-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Acked-by: Mike Rapoport Cc: "Dr. David Alan Gilbert" Cc: Mike Kravetz Cc: Pavel Emelyanov Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 2407249..9fd5e51 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -695,8 +695,7 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx, userfaultfd_event_wait_completion(ctx, &ewq); } -void userfaultfd_remove(struct vm_area_struct *vma, - struct vm_area_struct **prev, +bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; @@ -705,13 +704,11 @@ void userfaultfd_remove(struct vm_area_struct *vma, ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE)) - return; + return true; userfaultfd_ctx_get(ctx); up_read(&mm->mmap_sem); - *prev = NULL; /* We wait for ACK w/o the mmap semaphore */ - msg_init(&ewq.msg); ewq.msg.event = UFFD_EVENT_REMOVE; @@ -720,7 +717,7 @@ void userfaultfd_remove(struct vm_area_struct *vma, userfaultfd_event_wait_completion(ctx, &ewq); - down_read(&mm->mmap_sem); + return false; } static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, -- cgit v1.1 From c0d0e351285161a515396b7b1ee53ec9ffd97e3c Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Thu, 9 Mar 2017 16:17:37 -0800 Subject: fat: fix using uninitialized fields of fat_inode/fsinfo_inode Recently fallocate patch was merged and it uses MSDOS_I(inode)->mmu_private at fat_evict_inode(). However, fat_inode/fsinfo_inode that was introduced in past didn't initialize MSDOS_I(inode) properly. With those combinations, it became the cause of accessing random entry in FAT area. Link: http://lkml.kernel.org/r/87pohrj4i8.fsf@mail.parknet.co.jp Signed-off-by: OGAWA Hirofumi Reported-by: Moreno Bartalucci Tested-by: Moreno Bartalucci Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/inode.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 338d2f7..a2c05f2 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1359,6 +1359,16 @@ out: return 0; } +static void fat_dummy_inode_init(struct inode *inode) +{ + /* Initialize this dummy inode to work as no-op. */ + MSDOS_I(inode)->mmu_private = 0; + MSDOS_I(inode)->i_start = 0; + MSDOS_I(inode)->i_logstart = 0; + MSDOS_I(inode)->i_attrs = 0; + MSDOS_I(inode)->i_pos = 0; +} + static int fat_read_root(struct inode *inode) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); @@ -1803,12 +1813,13 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, fat_inode = new_inode(sb); if (!fat_inode) goto out_fail; - MSDOS_I(fat_inode)->i_pos = 0; + fat_dummy_inode_init(fat_inode); sbi->fat_inode = fat_inode; fsinfo_inode = new_inode(sb); if (!fsinfo_inode) goto out_fail; + fat_dummy_inode_init(fsinfo_inode); fsinfo_inode->i_ino = MSDOS_FSINFO_INO; sbi->fsinfo_inode = fsinfo_inode; insert_inode_hash(fsinfo_inode); -- cgit v1.1 From 2378cd6181edd94748d699448823609977283b11 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 9 Mar 2017 16:17:40 -0800 Subject: userfaultfd: remove wrong comment from userfaultfd_ctx_get() It's a void function, so there is no return value; Link: http://lkml.kernel.org/r/20170309150817.7510-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 9fd5e51..2bb1c72 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -138,8 +138,6 @@ out: * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd * context. * @ctx: [in] Pointer to the userfaultfd context. - * - * Returns: In case of success, returns not zero. */ static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) { -- cgit v1.1 From cdfbabfb2f0ce983fdaa42f20e5f7842178fc01e Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 9 Mar 2017 08:09:05 +0000 Subject: net: Work around lockdep limitation in sockets that use sockets Lockdep issues a circular dependency warning when AFS issues an operation through AF_RXRPC from a context in which the VFS/VM holds the mmap_sem. The theory lockdep comes up with is as follows: (1) If the pagefault handler decides it needs to read pages from AFS, it calls AFS with mmap_sem held and AFS begins an AF_RXRPC call, but creating a call requires the socket lock: mmap_sem must be taken before sk_lock-AF_RXRPC (2) afs_open_socket() opens an AF_RXRPC socket and binds it. rxrpc_bind() binds the underlying UDP socket whilst holding its socket lock. inet_bind() takes its own socket lock: sk_lock-AF_RXRPC must be taken before sk_lock-AF_INET (3) Reading from a TCP socket into a userspace buffer might cause a fault and thus cause the kernel to take the mmap_sem, but the TCP socket is locked whilst doing this: sk_lock-AF_INET must be taken before mmap_sem However, lockdep's theory is wrong in this instance because it deals only with lock classes and not individual locks. The AF_INET lock in (2) isn't really equivalent to the AF_INET lock in (3) as the former deals with a socket entirely internal to the kernel that never sees userspace. This is a limitation in the design of lockdep. Fix the general case by: (1) Double up all the locking keys used in sockets so that one set are used if the socket is created by userspace and the other set is used if the socket is created by the kernel. (2) Store the kern parameter passed to sk_alloc() in a variable in the sock struct (sk_kern_sock). This informs sock_lock_init(), sock_init_data() and sk_clone_lock() as to the lock keys to be used. Note that the child created by sk_clone_lock() inherits the parent's kern setting. (3) Add a 'kern' parameter to ->accept() that is analogous to the one passed in to ->create() that distinguishes whether kernel_accept() or sys_accept4() was the caller and can be passed to sk_alloc(). Note that a lot of accept functions merely dequeue an already allocated socket. I haven't touched these as the new socket already exists before we get the parameter. Note also that there are a couple of places where I've made the accepted socket unconditionally kernel-based: irda_accept() rds_rcp_accept_one() tcp_accept_from_sock() because they follow a sock_create_kern() and accept off of that. Whilst creating this, I noticed that lustre and ocfs don't create sockets through sock_create_kern() and thus they aren't marked as for-kernel, though they appear to be internal. I wonder if these should do that so that they use the new set of lock keys. Signed-off-by: David Howells Signed-off-by: David S. Miller --- fs/dlm/lowcomms.c | 2 +- fs/ocfs2/cluster/tcp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 7d398d3..9382db9 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -743,7 +743,7 @@ static int tcp_accept_from_sock(struct connection *con) newsock->type = con->sock->type; newsock->ops = con->sock->ops; - result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK); + result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK, true); if (result < 0) goto accept_err; diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 4348027..d0ab7e5 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1863,7 +1863,7 @@ static int o2net_accept_one(struct socket *sock, int *more) new_sock->type = sock->type; new_sock->ops = sock->ops; - ret = sock->ops->accept(sock, new_sock, O_NONBLOCK); + ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, false); if (ret < 0) goto out; -- cgit v1.1 From 4a3a485b1ed0e109718cc8c9d094fa0f552de9b2 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Fri, 10 Mar 2017 12:09:49 -0800 Subject: writeback: fix memory leak in wb_queue_work() When WB_registered flag is not set, wb_queue_work() skips queuing the work, but does not perform the necessary clean up. In particular, if work->auto_free is true, it should free the memory. The leak condition can be reprouced by following these steps: mount /dev/sdb /mnt/sdb /* In qemu console: device_del sdb */ umount /dev/sdb Above will result in a wb_queue_work() call on an unregistered wb and thus leak memory. Reported-by: John Sperbeck Signed-off-by: Tahsin Erdogan Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ef60059..63ee294 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -173,19 +173,33 @@ static void wb_wakeup(struct bdi_writeback *wb) spin_unlock_bh(&wb->work_lock); } +static void finish_writeback_work(struct bdi_writeback *wb, + struct wb_writeback_work *work) +{ + struct wb_completion *done = work->done; + + if (work->auto_free) + kfree(work); + if (done && atomic_dec_and_test(&done->cnt)) + wake_up_all(&wb->bdi->wb_waitq); +} + static void wb_queue_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { trace_writeback_queue(wb, work); - spin_lock_bh(&wb->work_lock); - if (!test_bit(WB_registered, &wb->state)) - goto out_unlock; if (work->done) atomic_inc(&work->done->cnt); - list_add_tail(&work->list, &wb->work_list); - mod_delayed_work(bdi_wq, &wb->dwork, 0); -out_unlock: + + spin_lock_bh(&wb->work_lock); + + if (test_bit(WB_registered, &wb->state)) { + list_add_tail(&work->list, &wb->work_list); + mod_delayed_work(bdi_wq, &wb->dwork, 0); + } else + finish_writeback_work(wb, work); + spin_unlock_bh(&wb->work_lock); } @@ -1873,16 +1887,9 @@ static long wb_do_writeback(struct bdi_writeback *wb) set_bit(WB_writeback_running, &wb->state); while ((work = get_next_work_item(wb)) != NULL) { - struct wb_completion *done = work->done; - trace_writeback_exec(wb, work); - wrote += wb_writeback(wb, work); - - if (work->auto_free) - kfree(work); - if (done && atomic_dec_and_test(&done->cnt)) - wake_up_all(&wb->bdi->wb_waitq); + finish_writeback_work(wb, work); } /* -- cgit v1.1 From 6f1f622019f95d79d6e2f8bb3781144ad0aff75f Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Mon, 6 Mar 2017 17:49:42 +0800 Subject: nfs4: fix a typo of NFS_ATTR_FATTR_GROUP_NAME This typo cause a memory leak, and a bad client's group id. unreferenced object 0xffff96d8073998d0 (size 8): comm "kworker/0:3", pid 34224, jiffies 4295361338 (age 761.752s) hex dump (first 8 bytes): 30 00 39 07 d8 96 ff ff 0.9..... backtrace: [] kmemleak_alloc+0x4a/0xa0 [] __kmalloc+0x140/0x220 [] xdr_stream_decode_string_dup+0x7c/0x110 [sunrpc] [] decode_getfattr_attrs+0x940/0x1630 [nfsv4] [] decode_getfattr_generic.constprop.108+0x9b/0x100 [nfsv4] [] nfs4_xdr_dec_open+0xcf/0x100 [nfsv4] [] rpcauth_unwrap_resp+0xa7/0xe0 [sunrpc] [] call_decode+0x1e0/0x810 [sunrpc] [] __rpc_execute+0x8d/0x420 [sunrpc] [] rpc_async_schedule+0x12/0x20 [sunrpc] [] process_one_work+0x197/0x430 [] worker_thread+0x4e/0x4a0 [] kthread+0x101/0x140 [] ret_from_fork+0x2c/0x40 [] 0xffffffffffffffff Fixes: 686a816ab6 ("NFSv4: Clean up owner/group attribute decode") Signed-off-by: Kinglong Mee Signed-off-by: Anna Schumaker --- fs/nfs/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index f0369e3..80ce289 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3942,7 +3942,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, if (len <= 0) goto out; dprintk("%s: name=%s\n", __func__, group_name->data); - return NFS_ATTR_FATTR_OWNER_NAME; + return NFS_ATTR_FATTR_GROUP_NAME; } else { len = xdr_stream_decode_opaque_inline(xdr, (void **)&p, XDR_MAX_NETOBJ); -- cgit v1.1 From 366a1569bff3fe14abfdf9285e31e05e091745f5 Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Mon, 6 Mar 2017 22:29:14 +0800 Subject: NFSv4: fix a reference leak caused WARNING messages Because nfs4_opendata_access() has close the state when access is denied, so the state isn't leak. Rather than revert the commit a974deee47, I'd like clean the strange state close. [ 1615.094218] ------------[ cut here ]------------ [ 1615.094607] WARNING: CPU: 0 PID: 23702 at lib/list_debug.c:31 __list_add_valid+0x8e/0xa0 [ 1615.094913] list_add double add: new=ffff9d7901d9f608, prev=ffff9d7901d9f608, next=ffff9d7901ee8dd0. [ 1615.095458] Modules linked in: nfsv4(E) nfs(E) nfsd(E) tun bridge stp llc fuse ip_set nfnetlink vmw_vsock_vmci_transport vsock f2fs snd_seq_midi snd_seq_midi_event fscrypto coretemp ppdev crct10dif_pclmul crc32_pclmul ghash_clmulni_intel intel_rapl_perf vmw_balloon snd_ens1371 joydev gameport snd_ac97_codec ac97_bus snd_seq snd_pcm snd_rawmidi snd_timer snd_seq_device snd soundcore nfit parport_pc parport acpi_cpufreq tpm_tis tpm_tis_core tpm i2c_piix4 vmw_vmci shpchp auth_rpcgss nfs_acl lockd(E) grace sunrpc(E) xfs libcrc32c vmwgfx drm_kms_helper ttm drm crc32c_intel mptspi e1000 serio_raw scsi_transport_spi mptscsih mptbase ata_generic pata_acpi fjes [last unloaded: nfs] [ 1615.097663] CPU: 0 PID: 23702 Comm: fstest Tainted: G W E 4.11.0-rc1+ #517 [ 1615.098015] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015 [ 1615.098807] Call Trace: [ 1615.099183] dump_stack+0x63/0x86 [ 1615.099578] __warn+0xcb/0xf0 [ 1615.099967] warn_slowpath_fmt+0x5f/0x80 [ 1615.100370] __list_add_valid+0x8e/0xa0 [ 1615.100760] nfs4_put_state_owner+0x75/0xc0 [nfsv4] [ 1615.101136] __nfs4_close+0x109/0x140 [nfsv4] [ 1615.101524] nfs4_close_state+0x15/0x20 [nfsv4] [ 1615.101949] nfs4_close_context+0x21/0x30 [nfsv4] [ 1615.102691] __put_nfs_open_context+0xb8/0x110 [nfs] [ 1615.103155] put_nfs_open_context+0x10/0x20 [nfs] [ 1615.103586] nfs4_file_open+0x13b/0x260 [nfsv4] [ 1615.103978] do_dentry_open+0x20a/0x2f0 [ 1615.104369] ? nfs4_copy_file_range+0x30/0x30 [nfsv4] [ 1615.104739] vfs_open+0x4c/0x70 [ 1615.105106] ? may_open+0x5a/0x100 [ 1615.105469] path_openat+0x623/0x1420 [ 1615.105823] do_filp_open+0x91/0x100 [ 1615.106174] ? __alloc_fd+0x3f/0x170 [ 1615.106568] do_sys_open+0x130/0x220 [ 1615.106920] ? __put_cred+0x3d/0x50 [ 1615.107256] SyS_open+0x1e/0x20 [ 1615.107588] entry_SYSCALL_64_fastpath+0x1a/0xa9 [ 1615.107922] RIP: 0033:0x7fab599069b0 [ 1615.108247] RSP: 002b:00007ffcf0600d78 EFLAGS: 00000246 ORIG_RAX: 0000000000000002 [ 1615.108575] RAX: ffffffffffffffda RBX: 00007fab59bcfae0 RCX: 00007fab599069b0 [ 1615.108896] RDX: 0000000000000200 RSI: 0000000000000200 RDI: 00007ffcf060255e [ 1615.109211] RBP: 0000000000040010 R08: 0000000000000000 R09: 0000000000000016 [ 1615.109515] R10: 00000000000006a1 R11: 0000000000000246 R12: 0000000000041000 [ 1615.109806] R13: 0000000000040010 R14: 0000000000001000 R15: 0000000000002710 [ 1615.110152] ---[ end trace 96ed63b1306bf2f3 ]--- Fixes: a974deee47 ("NFSv4: Fix memory and state leak in...") Signed-off-by: Kinglong Mee Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1b18368..c1f5369 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2258,8 +2258,6 @@ static int nfs4_opendata_access(struct rpc_cred *cred, if ((mask & ~cache.mask & (MAY_READ | MAY_EXEC)) == 0) return 0; - /* even though OPEN succeeded, access is denied. Close the file */ - nfs4_close_state(state, fmode); return -EACCES; } -- cgit v1.1 From 630a04e79dd41ff746b545d4fc052e0abb836120 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 15 Mar 2017 00:24:25 -0700 Subject: xfs: verify inline directory data forks When we're reading or writing the data fork of an inline directory, check the contents to make sure we're not overflowing buffers or eating garbage data. xfs/348 corrupts an inline symlink into an inline directory, triggering a buffer overflow bug. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- v2: add more checks consistent with _dir2_sf_check and make the verifier usable from anywhere. --- fs/xfs/libxfs/xfs_dir2_priv.h | 2 + fs/xfs/libxfs/xfs_dir2_sf.c | 87 ++++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_inode_fork.c | 26 +++++++++++-- fs/xfs/libxfs/xfs_inode_fork.h | 2 +- fs/xfs/xfs_dir2_readdir.c | 11 ------ fs/xfs/xfs_inode.c | 12 ++++-- 6 files changed, 122 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index d04547f..eb00bc1 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -125,6 +125,8 @@ extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); +extern int xfs_dir2_sf_verify(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *sfp, + int size); /* xfs_dir2_readdir.c */ extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index c6809ff..96b45cd 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -629,6 +629,93 @@ xfs_dir2_sf_check( } #endif /* DEBUG */ +/* Verify the consistency of an inline directory. */ +int +xfs_dir2_sf_verify( + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *sfp, + int size) +{ + struct xfs_dir2_sf_entry *sfep; + struct xfs_dir2_sf_entry *next_sfep; + char *endp; + const struct xfs_dir_ops *dops; + xfs_ino_t ino; + int i; + int i8count; + int offset; + __uint8_t filetype; + + dops = xfs_dir_get_ops(mp, NULL); + + /* + * Give up if the directory is way too short. + */ + XFS_WANT_CORRUPTED_RETURN(mp, size > + offsetof(struct xfs_dir2_sf_hdr, parent)); + XFS_WANT_CORRUPTED_RETURN(mp, size >= + xfs_dir2_sf_hdr_size(sfp->i8count)); + + endp = (char *)sfp + size; + + /* Check .. entry */ + ino = dops->sf_get_parent_ino(sfp); + i8count = ino > XFS_DIR2_MAX_SHORT_INUM; + XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, ino)); + offset = dops->data_first_offset; + + /* Check all reported entries */ + sfep = xfs_dir2_sf_firstentry(sfp); + for (i = 0; i < sfp->count; i++) { + /* + * struct xfs_dir2_sf_entry has a variable length. + * Check the fixed-offset parts of the structure are + * within the data buffer. + */ + XFS_WANT_CORRUPTED_RETURN(mp, + ((char *)sfep + sizeof(*sfep)) < endp); + + /* Don't allow names with known bad length. */ + XFS_WANT_CORRUPTED_RETURN(mp, sfep->namelen > 0); + XFS_WANT_CORRUPTED_RETURN(mp, sfep->namelen < MAXNAMELEN); + + /* + * Check that the variable-length part of the structure is + * within the data buffer. The next entry starts after the + * name component, so nextentry is an acceptable test. + */ + next_sfep = dops->sf_nextentry(sfp, sfep); + XFS_WANT_CORRUPTED_RETURN(mp, endp >= (char *)next_sfep); + + /* Check that the offsets always increase. */ + XFS_WANT_CORRUPTED_RETURN(mp, + xfs_dir2_sf_get_offset(sfep) >= offset); + + /* Check the inode number. */ + ino = dops->sf_get_ino(sfp, sfep); + i8count += ino > XFS_DIR2_MAX_SHORT_INUM; + XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, ino)); + + /* Check the file type. */ + filetype = dops->sf_get_ftype(sfep); + XFS_WANT_CORRUPTED_RETURN(mp, filetype < XFS_DIR3_FT_MAX); + + offset = xfs_dir2_sf_get_offset(sfep) + + dops->data_entsize(sfep->namelen); + + sfep = next_sfep; + } + XFS_WANT_CORRUPTED_RETURN(mp, i8count == sfp->i8count); + XFS_WANT_CORRUPTED_RETURN(mp, (void *)sfep == (void *)endp); + + /* Make sure this whole thing ought to be in local format. */ + XFS_WANT_CORRUPTED_RETURN(mp, offset + + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (uint)sizeof(xfs_dir2_block_tail_t) <= mp->m_dir_geo->blksize); + + return 0; +} + /* * Create a new (shortform) directory. */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 25c1e07..9653e96 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -33,6 +33,8 @@ #include "xfs_trace.h" #include "xfs_attr_sf.h" #include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_priv.h" kmem_zone_t *xfs_ifork_zone; @@ -320,6 +322,7 @@ xfs_iformat_local( int whichfork, int size) { + int error; /* * If the size is unreasonable, then something @@ -336,6 +339,14 @@ xfs_iformat_local( return -EFSCORRUPTED; } + if (S_ISDIR(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) { + error = xfs_dir2_sf_verify(ip->i_mount, + (struct xfs_dir2_sf_hdr *)XFS_DFORK_DPTR(dip), + size); + if (error) + return error; + } + xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size); return 0; } @@ -856,7 +867,7 @@ xfs_iextents_copy( * In these cases, the format always takes precedence, because the * format indicates the current state of the fork. */ -void +int xfs_iflush_fork( xfs_inode_t *ip, xfs_dinode_t *dip, @@ -866,6 +877,7 @@ xfs_iflush_fork( char *cp; xfs_ifork_t *ifp; xfs_mount_t *mp; + int error; static const short brootflag[2] = { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; static const short dataflag[2] = @@ -874,7 +886,7 @@ xfs_iflush_fork( { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; if (!iip) - return; + return 0; ifp = XFS_IFORK_PTR(ip, whichfork); /* * This can happen if we gave up in iformat in an error path, @@ -882,12 +894,19 @@ xfs_iflush_fork( */ if (!ifp) { ASSERT(whichfork == XFS_ATTR_FORK); - return; + return 0; } cp = XFS_DFORK_PTR(dip, whichfork); mp = ip->i_mount; switch (XFS_IFORK_FORMAT(ip, whichfork)) { case XFS_DINODE_FMT_LOCAL: + if (S_ISDIR(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) { + error = xfs_dir2_sf_verify(mp, + (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data, + ifp->if_bytes); + if (error) + return error; + } if ((iip->ili_fields & dataflag[whichfork]) && (ifp->if_bytes > 0)) { ASSERT(ifp->if_u1.if_data != NULL); @@ -940,6 +959,7 @@ xfs_iflush_fork( ASSERT(0); break; } + return 0; } /* diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 7fb8365..132dc59 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -140,7 +140,7 @@ typedef struct xfs_ifork { struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); -void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, +int xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); void xfs_idestroy_fork(struct xfs_inode *, int); void xfs_idata_realloc(struct xfs_inode *, int, int); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 003a99b..ad9396e 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -71,22 +71,11 @@ xfs_dir2_sf_getdents( struct xfs_da_geometry *geo = args->geo; ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - /* - * Give up if the directory is way too short. - */ - if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); - return -EIO; - } - ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - if (dp->i_d.di_size < xfs_dir2_sf_hdr_size(sfp->i8count)) - return -EFSCORRUPTED; - /* * If the block number in the offset is out of range, we're done. */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7eaf1ef..c7fe2c2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3475,6 +3475,7 @@ xfs_iflush_int( struct xfs_inode_log_item *iip = ip->i_itemp; struct xfs_dinode *dip; struct xfs_mount *mp = ip->i_mount; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); @@ -3557,9 +3558,14 @@ xfs_iflush_int( if (ip->i_d.di_flushiter == DI_MAX_FLUSH) ip->i_d.di_flushiter = 0; - xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); - if (XFS_IFORK_Q(ip)) - xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); + error = xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); + if (error) + return error; + if (XFS_IFORK_Q(ip)) { + error = xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); + if (error) + return error; + } xfs_inobp_check(mp, bp); /* -- cgit v1.1 From 28ea06c46fbcab63fd9a55531387b7928a18a590 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 6 Mar 2017 12:58:42 -0500 Subject: gfs2: Avoid alignment hole in struct lm_lockname Commit 88ffbf3e03 switches to using rhashtables for glocks, hashing over the entire struct lm_lockname instead of its individual fields. On some architectures, struct lm_lockname contains a hole of uninitialized memory due to alignment rules, which now leads to incorrect hash values. Get rid of that hole. Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson CC: #v4.3+ --- fs/gfs2/incore.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index c45084a..511e1ed 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -207,7 +207,7 @@ struct lm_lockname { struct gfs2_sbd *ln_sbd; u64 ln_number; unsigned int ln_type; -}; +} __packed __aligned(sizeof(int)); #define lm_name_equal(name1, name2) \ (((name1)->ln_number == (name2)->ln_number) && \ -- cgit v1.1 From 29c8bbbd6e21daa0997d1c3ee886b897ee7ad652 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:43 +0000 Subject: afs: Fix missing put_page() In afs_writepages_region(), inside the loop where we find dirty pages to deal with, one of the if-statements is missing a put_page(). Signed-off-by: David Howells --- fs/afs/write.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/afs/write.c b/fs/afs/write.c index c83c1a0..e919e64 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -513,6 +513,7 @@ static int afs_writepages_region(struct address_space *mapping, if (PageWriteback(page) || !PageDirty(page)) { unlock_page(page); + put_page(page); continue; } -- cgit v1.1 From 5611ef280d814042825ee17688f5751266fc538b Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:43 +0000 Subject: afs: Fix page overput in afs_fill_page() afs_fill_page() loads the page it wants to fill into the afs_read request without incrementing its refcount - but then calls afs_put_read() to clean up afterwards, which then releases a ref on the page. Fix this by getting a ref on the page before calling afs_vnode_fetch_data(). This causes sync after a write to hang in afs_writepages_region() because find_get_pages_tag() gets confused and doesn't return. Fixes: 196ee9cd2d04 ("afs: Make afs_fs_fetch_data() take a list of pages") Reported-by: Marc Dionne Signed-off-by: David Howells Tested-by: Marc Dionne --- fs/afs/write.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/afs/write.c b/fs/afs/write.c index e919e64..3ac52f6 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -101,6 +101,7 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, req->pos = pos; req->nr_pages = 1; req->pages[0] = page; + get_page(page); i_size = i_size_read(&vnode->vfs_inode); if (pos + PAGE_SIZE > i_size) -- cgit v1.1 From 6186f0788b31f44affceeedc7b48eb10faea120d Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Thu, 16 Mar 2017 16:27:43 +0000 Subject: afs: Populate group ID from vnode status The group was hard coded to GLOBAL_ROOT_GID; use the group ID that was received from the server. Signed-off-by: Marc Dionne Signed-off-by: David Howells --- fs/afs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 1e4897a..299dbae 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -70,7 +70,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) set_nlink(inode, vnode->status.nlink); inode->i_uid = vnode->status.owner; - inode->i_gid = GLOBAL_ROOT_GID; + inode->i_gid = vnode->status.group; inode->i_size = vnode->status.size; inode->i_ctime.tv_sec = vnode->status.mtime_server; inode->i_ctime.tv_nsec = 0; -- cgit v1.1 From 627f46943ff90bcc32ddeb675d881c043c6fa2ae Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Thu, 16 Mar 2017 16:27:44 +0000 Subject: afs: Adjust mode bits processing Mode bits for an afs file should not be enforced in the usual way. For files, the absence of user bits can restrict file access with respect to what is granted by the server. These bits apply regardless of the owner or the current uid; the rest of the mode bits (group, other) are ignored. Signed-off-by: Marc Dionne Signed-off-by: David Howells --- fs/afs/security.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/security.c b/fs/afs/security.c index 8d01042..bfa9d34 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -340,17 +340,22 @@ int afs_permission(struct inode *inode, int mask) } else { if (!(access & AFS_ACE_LOOKUP)) goto permission_denied; + if ((mask & MAY_EXEC) && !(inode->i_mode & S_IXUSR)) + goto permission_denied; if (mask & (MAY_EXEC | MAY_READ)) { if (!(access & AFS_ACE_READ)) goto permission_denied; + if (!(inode->i_mode & S_IRUSR)) + goto permission_denied; } else if (mask & MAY_WRITE) { if (!(access & AFS_ACE_WRITE)) goto permission_denied; + if (!(inode->i_mode & S_IWUSR)) + goto permission_denied; } } key_put(key); - ret = generic_permission(inode, mask); _leave(" = %d", ret); return ret; -- cgit v1.1 From bcd89270d93b7edebb5de5e5e7dca1a77a33496e Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Thu, 16 Mar 2017 16:27:44 +0000 Subject: afs: Deal with an empty callback array Servers may send a callback array that is the same size as the FID array, or an empty array. If the callback count is 0, the code would attempt to read (fid_count * 12) bytes of data, which would fail and result in an unmarshalling error. This would lead to stale data for remotely modified files or directories. Store the callback array size in the internal afs_call structure and use that to determine the amount of data to read. Signed-off-by: Marc Dionne --- fs/afs/cmservice.c | 11 +++++------ fs/afs/internal.h | 5 ++++- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 2edbdcb..3062cce 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -187,7 +187,6 @@ static int afs_deliver_cb_callback(struct afs_call *call) struct afs_callback *cb; struct afs_server *server; __be32 *bp; - u32 tmp; int ret, loop; _enter("{%u}", call->unmarshall); @@ -249,9 +248,9 @@ static int afs_deliver_cb_callback(struct afs_call *call) if (ret < 0) return ret; - tmp = ntohl(call->tmp); - _debug("CB count: %u", tmp); - if (tmp != call->count && tmp != 0) + call->count2 = ntohl(call->tmp); + _debug("CB count: %u", call->count2); + if (call->count2 != call->count && call->count2 != 0) return -EBADMSG; call->offset = 0; call->unmarshall++; @@ -259,14 +258,14 @@ static int afs_deliver_cb_callback(struct afs_call *call) case 4: _debug("extract CB array"); ret = afs_extract_data(call, call->buffer, - call->count * 3 * 4, false); + call->count2 * 3 * 4, false); if (ret < 0) return ret; _debug("unmarshall CB array"); cb = call->request; bp = call->buffer; - for (loop = call->count; loop > 0; loop--, cb++) { + for (loop = call->count2; loop > 0; loop--, cb++) { cb->version = ntohl(*bp++); cb->expiry = ntohl(*bp++); cb->type = ntohl(*bp++); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 5dfa569..8499870 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -90,7 +90,10 @@ struct afs_call { unsigned request_size; /* size of request data */ unsigned reply_max; /* maximum size of reply */ unsigned first_offset; /* offset into mapping[first] */ - unsigned last_to; /* amount of mapping[last] */ + union { + unsigned last_to; /* amount of mapping[last] */ + unsigned count2; /* count used in unmarshalling */ + }; unsigned char unmarshall; /* unmarshalling phase */ bool incoming; /* T if incoming call */ bool send_pages; /* T if data from mapping should be sent */ -- cgit v1.1 From 6db3ac3c4bc552837d232ec794559a2fae2815a0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:44 +0000 Subject: afs: Handle better the server returning excess or short data When an AFS server is given an FS.FetchData{,64} request to read data from a file, it is permitted by the protocol to return more or less than was requested. kafs currently relies on the latter behaviour in readpage{,s} to handle a partial page at the end of the file (we just ask for a whole page and clear space beyond the short read). However, we don't handle all cases. Add: (1) Handle excess data by discarding it rather than aborting. Note that we use a common static buffer to discard into so that the decryption algorithm advances the PCBC state. (2) Handle a short read that affects more than just the last page. Note that if a read comes up unexpectedly short of long, it's possible that the server's copy of the file changed - in which case the data version number will have been incremented and the callback will have been broken - in which case all the pages currently attached to the inode will be zapped anyway at some point. Signed-off-by: David Howells --- fs/afs/file.c | 7 +++++-- fs/afs/fsclient.c | 49 +++++++++++++++++++++++++++++++++++-------------- 2 files changed, 40 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/afs/file.c b/fs/afs/file.c index ba7b71f..a38e1c3 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -184,10 +184,13 @@ int afs_page_filler(void *data, struct page *page) if (!req) goto enomem; + /* We request a full page. If the page is a partial one at the + * end of the file, the server will return a short read and the + * unmarshalling code will clear the unfilled space. + */ atomic_set(&req->usage, 1); req->pos = (loff_t)page->index << PAGE_SHIFT; - req->len = min_t(size_t, i_size_read(inode) - req->pos, - PAGE_SIZE); + req->len = PAGE_SIZE; req->nr_pages = 1; req->pages[0] = page; get_page(page); diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index ac8e766..bf8904a 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -17,6 +17,12 @@ #include "afs_fs.h" /* + * We need somewhere to discard into in case the server helpfully returns more + * than we asked for in FS.FetchData{,64}. + */ +static u8 afs_discard_buffer[64]; + +/* * decode an AFSFid block */ static void xdr_decode_AFSFid(const __be32 **_bp, struct afs_fid *fid) @@ -353,12 +359,6 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) req->actual_len |= ntohl(call->tmp); _debug("DATA length: %llu", req->actual_len); - /* Check that the server didn't want to send us extra. We - * might want to just discard instead, but that requires - * cooperation from AF_RXRPC. - */ - if (req->actual_len > req->len) - return -EBADMSG; req->remain = req->actual_len; call->offset = req->pos & (PAGE_SIZE - 1); @@ -368,6 +368,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) call->unmarshall++; begin_page: + ASSERTCMP(req->index, <, req->nr_pages); if (req->remain > PAGE_SIZE - call->offset) size = PAGE_SIZE - call->offset; else @@ -390,18 +391,37 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) if (req->page_done) req->page_done(call, req); if (req->remain > 0) { - req->index++; call->offset = 0; + req->index++; + if (req->index >= req->nr_pages) + goto begin_discard; goto begin_page; } } + goto no_more_data; + + /* Discard any excess data the server gave us */ + begin_discard: + case 4: + size = min_t(size_t, sizeof(afs_discard_buffer), req->remain); + call->count = size; + _debug("extract discard %u/%llu %zu/%u", + req->remain, req->actual_len, call->offset, call->count); + + call->offset = 0; + ret = afs_extract_data(call, afs_discard_buffer, call->count, true); + req->remain -= call->offset; + if (ret < 0) + return ret; + if (req->remain > 0) + goto begin_discard; no_more_data: call->offset = 0; - call->unmarshall++; + call->unmarshall = 5; /* extract the metadata */ - case 4: + case 5: ret = afs_extract_data(call, call->buffer, (21 + 3 + 6) * 4, false); if (ret < 0) @@ -416,16 +436,17 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) call->offset = 0; call->unmarshall++; - case 5: + case 6: break; } - if (call->count < PAGE_SIZE) { - buffer = kmap(req->pages[req->index]); - memset(buffer + call->count, 0, PAGE_SIZE - call->count); - kunmap(req->pages[req->index]); + for (; req->index < req->nr_pages; req->index++) { + if (call->count < PAGE_SIZE) + zero_user_segment(req->pages[req->index], + call->count, PAGE_SIZE); if (req->page_done) req->page_done(call, req); + call->count = 0; } _leave(" = 0 [done]"); -- cgit v1.1 From 3448e6521755862446aed28e29abf12565d8844e Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:44 +0000 Subject: afs: Kill struct afs_read::pg_offset Kill struct afs_read::pg_offset as nothing uses it. It's unnecessary as pos can be masked off. Signed-off-by: David Howells --- fs/afs/internal.h | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 8499870..7784a8b 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -135,7 +135,6 @@ struct afs_read { atomic_t usage; unsigned int remain; /* Amount remaining */ unsigned int index; /* Which page we're reading into */ - unsigned int pg_offset; /* Offset in page we're at */ unsigned int nr_pages; void (*page_done)(struct afs_call *, struct afs_read *); struct page *pages[]; -- cgit v1.1 From e8e581a88c5f5fc7cf1f636d122b77fbcfc8c2f6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:44 +0000 Subject: afs: Handle a short write to an AFS page Handle the situation where afs_write_begin() is told to expect that a full-page write will be made, but this doesn't happen (EFAULT, CTRL-C, etc.), and so afs_write_end() sees a partial write took place. Currently, no attempt is to deal with the discrepency. Fix this by loading the gap from the server. Reported-by: Al Viro Signed-off-by: David Howells --- fs/afs/fsclient.c | 4 +++- fs/afs/internal.h | 2 +- fs/afs/write.c | 28 +++++++++++++++++++--------- 3 files changed, 23 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index bf8904a..6f917dd 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -393,8 +393,10 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) if (req->remain > 0) { call->offset = 0; req->index++; - if (req->index >= req->nr_pages) + if (req->index >= req->nr_pages) { + call->unmarshall = 4; goto begin_discard; + } goto begin_page; } } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 7784a8b..dc2cb48 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -130,7 +130,7 @@ struct afs_call_type { */ struct afs_read { loff_t pos; /* Where to start reading */ - loff_t len; /* How much to read */ + loff_t len; /* How much we're asking for */ loff_t actual_len; /* How much we're actually getting */ atomic_t usage; unsigned int remain; /* Amount remaining */ diff --git a/fs/afs/write.c b/fs/afs/write.c index 3ac52f6..ea66890 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -84,10 +84,9 @@ void afs_put_writeback(struct afs_writeback *wb) * partly or wholly fill a page that's under preparation for writing */ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, - loff_t pos, struct page *page) + loff_t pos, unsigned int len, struct page *page) { struct afs_read *req; - loff_t i_size; int ret; _enter(",,%llu", (unsigned long long)pos); @@ -99,16 +98,11 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, atomic_set(&req->usage, 1); req->pos = pos; + req->len = len; req->nr_pages = 1; req->pages[0] = page; get_page(page); - i_size = i_size_read(&vnode->vfs_inode); - if (pos + PAGE_SIZE > i_size) - req->len = i_size - pos; - else - req->len = PAGE_SIZE; - ret = afs_vnode_fetch_data(vnode, key, req); afs_put_read(req); if (ret < 0) { @@ -164,7 +158,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping, /* page won't leak in error case: it eventually gets cleaned off LRU */ if (!PageUptodate(page) && len != PAGE_SIZE) { - ret = afs_fill_page(vnode, key, index << PAGE_SHIFT, page); + ret = afs_fill_page(vnode, key, pos & PAGE_MASK, PAGE_SIZE, page); if (ret < 0) { kfree(candidate); _leave(" = %d [prep]", ret); @@ -258,7 +252,9 @@ int afs_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + struct key *key = file->private_data; loff_t i_size, maybe_i_size; + int ret; _enter("{%x:%u},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index); @@ -274,6 +270,20 @@ int afs_write_end(struct file *file, struct address_space *mapping, spin_unlock(&vnode->writeback_lock); } + if (!PageUptodate(page)) { + if (copied < len) { + /* Try and load any missing data from the server. The + * unmarshalling routine will take care of clearing any + * bits that are beyond the EOF. + */ + ret = afs_fill_page(vnode, key, pos + copied, + len - copied, page); + if (ret < 0) + return ret; + } + SetPageUptodate(page); + } + set_page_dirty(page); if (PageDirty(page)) _debug("dirtied"); -- cgit v1.1 From 58fed94dfb17e89556b5705f20f90e5b2971b6a1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:45 +0000 Subject: afs: Flush outstanding writes when an fd is closed Flush outstanding writes in afs when an fd is closed. This is what NFS and CIFS do. Reported-by: Marc Dionne Signed-off-by: David Howells --- fs/afs/file.c | 1 + fs/afs/internal.h | 1 + fs/afs/write.c | 14 ++++++++++++++ 3 files changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/afs/file.c b/fs/afs/file.c index a38e1c3..b582944 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -30,6 +30,7 @@ static int afs_readpages(struct file *filp, struct address_space *mapping, const struct file_operations afs_file_operations = { .open = afs_open, + .flush = afs_flush, .release = afs_release, .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, diff --git a/fs/afs/internal.h b/fs/afs/internal.h index dc2cb48..af1d91e 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -720,6 +720,7 @@ extern int afs_writepages(struct address_space *, struct writeback_control *); extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *); extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *); extern int afs_writeback_all(struct afs_vnode *); +extern int afs_flush(struct file *, fl_owner_t); extern int afs_fsync(struct file *, loff_t, loff_t, int); diff --git a/fs/afs/write.c b/fs/afs/write.c index ea66890..f1450ea 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -758,6 +758,20 @@ out: } /* + * Flush out all outstanding writes on a file opened for writing when it is + * closed. + */ +int afs_flush(struct file *file, fl_owner_t id) +{ + _enter(""); + + if ((file->f_mode & FMODE_WRITE) == 0) + return 0; + + return vfs_fsync(file, 0); +} + +/* * notification that a previously read-only page is about to become writable * - if it returns an error, the caller will deliver a bus error signal */ -- cgit v1.1 From 944c74f472f926785b1948efa0e73e2f1b3b539b Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:45 +0000 Subject: afs: Distinguish mountpoints from symlinks by file mode alone In AFS, mountpoints appear as symlinks with mode 0644 and normal symlinks have mode 0777, so use this to distinguish them rather than reading the content and parsing it. In the case of a mountpoint, the symlink body is a formatted string indicating the location of the target volume. Note that with this, kAFS no longer 'pre-fetches' the contents of symlinks, so afs_readpage() may fail with an access-denial because when the VFS calls d_automount(), it wraps the call in an credentials override that sets the initial creds - thereby preventing access to the caller's keyrings and the authentication keys held therein. To this end, a patch reverting that change to the VFS is required also. Reported-by: Jeffrey Altman Signed-off-by: David Howells --- fs/afs/inode.c | 29 +++++++++++++++-------------- fs/afs/internal.h | 1 - fs/afs/mntpt.c | 53 ----------------------------------------------------- 3 files changed, 15 insertions(+), 68 deletions(-) (limited to 'fs') diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 299dbae..ade6ec3 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -54,8 +54,21 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) inode->i_fop = &afs_dir_file_operations; break; case AFS_FTYPE_SYMLINK: - inode->i_mode = S_IFLNK | vnode->status.mode; - inode->i_op = &page_symlink_inode_operations; + /* Symlinks with a mode of 0644 are actually mountpoints. */ + if ((vnode->status.mode & 0777) == 0644) { + inode->i_flags |= S_AUTOMOUNT; + + spin_lock(&vnode->lock); + set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); + spin_unlock(&vnode->lock); + + inode->i_mode = S_IFDIR | 0555; + inode->i_op = &afs_mntpt_inode_operations; + inode->i_fop = &afs_mntpt_file_operations; + } else { + inode->i_mode = S_IFLNK | vnode->status.mode; + inode->i_op = &page_symlink_inode_operations; + } inode_nohighmem(inode); break; default: @@ -79,18 +92,6 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) inode->i_generation = vnode->fid.unique; inode->i_version = vnode->status.data_version; inode->i_mapping->a_ops = &afs_fs_aops; - - /* check to see whether a symbolic link is really a mountpoint */ - if (vnode->status.type == AFS_FTYPE_SYMLINK) { - afs_mntpt_check_symlink(vnode, key); - - if (test_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags)) { - inode->i_mode = S_IFDIR | vnode->status.mode; - inode->i_op = &afs_mntpt_inode_operations; - inode->i_fop = &afs_mntpt_file_operations; - } - } - return 0; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index af1d91e..39de154 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -559,7 +559,6 @@ extern const struct inode_operations afs_autocell_inode_operations; extern const struct file_operations afs_mntpt_file_operations; extern struct vfsmount *afs_d_automount(struct path *); -extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *); extern void afs_mntpt_kill_timer(void); /* diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index d4fb0af..bd3b65c 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -47,59 +47,6 @@ static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out); static unsigned long afs_mntpt_expiry_timeout = 10 * 60; /* - * check a symbolic link to see whether it actually encodes a mountpoint - * - sets the AFS_VNODE_MOUNTPOINT flag on the vnode appropriately - */ -int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) -{ - struct page *page; - size_t size; - char *buf; - int ret; - - _enter("{%x:%u,%u}", - vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); - - /* read the contents of the symlink into the pagecache */ - page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, - afs_page_filler, key); - if (IS_ERR(page)) { - ret = PTR_ERR(page); - goto out; - } - - ret = -EIO; - if (PageError(page)) - goto out_free; - - buf = kmap(page); - - /* examine the symlink's contents */ - size = vnode->status.size; - _debug("symlink to %*.*s", (int) size, (int) size, buf); - - if (size > 2 && - (buf[0] == '%' || buf[0] == '#') && - buf[size - 1] == '.' - ) { - _debug("symlink is a mountpoint"); - spin_lock(&vnode->lock); - set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags); - vnode->vfs_inode.i_flags |= S_AUTOMOUNT; - spin_unlock(&vnode->lock); - } - - ret = 0; - - kunmap(page); -out_free: - put_page(page); -out: - _leave(" = %d", ret); - return ret; -} - -/* * no valid lookup procedure on this sort of dir */ static struct dentry *afs_mntpt_lookup(struct inode *dir, -- cgit v1.1 From 1d7e4ebf291d3103466006926329e39aa355944f Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Thu, 16 Mar 2017 16:27:45 +0000 Subject: afs: inode: Replace rcu_assign_pointer() with RCU_INIT_POINTER() The use of "rcu_assign_pointer()" is NULLing out the pointer. According to RCU_INIT_POINTER()'s block comment: "1. This use of RCU_INIT_POINTER() is NULLing out the pointer" it is better to use it instead of rcu_assign_pointer() because it has a smaller overhead. The following Coccinelle semantic patch was used: @@ @@ - rcu_assign_pointer + RCU_INIT_POINTER (..., NULL) Signed-off-by: Andreea-Cristina Bernat Signed-off-by: David Howells --- fs/afs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/inode.c b/fs/afs/inode.c index ade6ec3..e083e08 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -445,7 +445,7 @@ void afs_evict_inode(struct inode *inode) mutex_lock(&vnode->permits_lock); permits = vnode->permits; - rcu_assign_pointer(vnode->permits, NULL); + RCU_INIT_POINTER(vnode->permits, NULL); mutex_unlock(&vnode->permits_lock); if (permits) call_rcu(&permits->rcu, afs_zap_permits); -- cgit v1.1 From df8a09d1b8f9e693ec3f6b7e0162fc817f2cf0db Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Thu, 16 Mar 2017 16:27:45 +0000 Subject: afs: security: Replace rcu_assign_pointer() with RCU_INIT_POINTER() The use of "rcu_assign_pointer()" is NULLing out the pointer. According to RCU_INIT_POINTER()'s block comment: "1. This use of RCU_INIT_POINTER() is NULLing out the pointer" it is better to use it instead of rcu_assign_pointer() because it has a smaller overhead. The following Coccinelle semantic patch was used: @@ @@ - rcu_assign_pointer + RCU_INIT_POINTER (..., NULL) Signed-off-by: Andreea-Cristina Bernat Signed-off-by: David Howells --- fs/afs/security.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/security.c b/fs/afs/security.c index bfa9d34..ecb86a6 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -114,7 +114,7 @@ void afs_clear_permits(struct afs_vnode *vnode) mutex_lock(&vnode->permits_lock); permits = vnode->permits; - rcu_assign_pointer(vnode->permits, NULL); + RCU_INIT_POINTER(vnode->permits, NULL); mutex_unlock(&vnode->permits_lock); if (permits) -- cgit v1.1 From 8a79790bf0b7da216627ffb85f52cfb4adbf1e4e Mon Sep 17 00:00:00 2001 From: Tina Ruchandani Date: Thu, 16 Mar 2017 16:27:46 +0000 Subject: afs: Migrate vlocation fields to 64-bit get_seconds() returns real wall-clock seconds. On 32-bit systems this value will overflow in year 2038 and beyond. This patch changes afs's vlocation record to use ktime_get_real_seconds() instead, for the fields time_of_death and update_at. Signed-off-by: Tina Ruchandani Signed-off-by: David Howells --- fs/afs/callback.c | 7 ++++--- fs/afs/internal.h | 7 ++++--- fs/afs/server.c | 6 +++--- fs/afs/vlocation.c | 16 +++++++++------- 4 files changed, 20 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/afs/callback.c b/fs/afs/callback.c index b29447e..25d404d 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -362,7 +362,7 @@ static void afs_callback_updater(struct work_struct *work) { struct afs_server *server; struct afs_vnode *vnode, *xvnode; - time_t now; + time64_t now; long timeout; int ret; @@ -370,7 +370,7 @@ static void afs_callback_updater(struct work_struct *work) _enter(""); - now = get_seconds(); + now = ktime_get_real_seconds(); /* find the first vnode to update */ spin_lock(&server->cb_lock); @@ -424,7 +424,8 @@ static void afs_callback_updater(struct work_struct *work) /* and then reschedule */ _debug("reschedule"); - vnode->update_at = get_seconds() + afs_vnode_update_timeout; + vnode->update_at = ktime_get_real_seconds() + + afs_vnode_update_timeout; spin_lock(&server->cb_lock); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 39de154..97a16ce 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -249,7 +250,7 @@ struct afs_cache_vhash { */ struct afs_vlocation { atomic_t usage; - time_t time_of_death; /* time at which put reduced usage to 0 */ + time64_t time_of_death; /* time at which put reduced usage to 0 */ struct list_head link; /* link in cell volume location list */ struct list_head grave; /* link in master graveyard list */ struct list_head update; /* link in master update list */ @@ -260,7 +261,7 @@ struct afs_vlocation { struct afs_cache_vlocation vldb; /* volume information DB record */ struct afs_volume *vols[3]; /* volume access record pointer (index by type) */ wait_queue_head_t waitq; /* status change waitqueue */ - time_t update_at; /* time at which record should be updated */ + time64_t update_at; /* time at which record should be updated */ spinlock_t lock; /* access lock */ afs_vlocation_state_t state; /* volume location state */ unsigned short upd_rej_cnt; /* ENOMEDIUM count during update */ @@ -273,7 +274,7 @@ struct afs_vlocation { */ struct afs_server { atomic_t usage; - time_t time_of_death; /* time at which put reduced usage to 0 */ + time64_t time_of_death; /* time at which put reduced usage to 0 */ struct in_addr addr; /* server address */ struct afs_cell *cell; /* cell in which server resides */ struct list_head link; /* link in cell's server list */ diff --git a/fs/afs/server.c b/fs/afs/server.c index d4066ab..c001b1f 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -242,7 +242,7 @@ void afs_put_server(struct afs_server *server) spin_lock(&afs_server_graveyard_lock); if (atomic_read(&server->usage) == 0) { list_move_tail(&server->grave, &afs_server_graveyard); - server->time_of_death = get_seconds(); + server->time_of_death = ktime_get_real_seconds(); queue_delayed_work(afs_wq, &afs_server_reaper, afs_server_timeout * HZ); } @@ -277,9 +277,9 @@ static void afs_reap_server(struct work_struct *work) LIST_HEAD(corpses); struct afs_server *server; unsigned long delay, expiry; - time_t now; + time64_t now; - now = get_seconds(); + now = ktime_get_real_seconds(); spin_lock(&afs_server_graveyard_lock); while (!list_empty(&afs_server_graveyard)) { diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index d7d8dd8..37b7c3b 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -340,7 +340,8 @@ static void afs_vlocation_queue_for_updates(struct afs_vlocation *vl) struct afs_vlocation *xvl; /* wait at least 10 minutes before updating... */ - vl->update_at = get_seconds() + afs_vlocation_update_timeout; + vl->update_at = ktime_get_real_seconds() + + afs_vlocation_update_timeout; spin_lock(&afs_vlocation_updates_lock); @@ -506,7 +507,7 @@ void afs_put_vlocation(struct afs_vlocation *vl) if (atomic_read(&vl->usage) == 0) { _debug("buried"); list_move_tail(&vl->grave, &afs_vlocation_graveyard); - vl->time_of_death = get_seconds(); + vl->time_of_death = ktime_get_real_seconds(); queue_delayed_work(afs_wq, &afs_vlocation_reap, afs_vlocation_timeout * HZ); @@ -543,11 +544,11 @@ static void afs_vlocation_reaper(struct work_struct *work) LIST_HEAD(corpses); struct afs_vlocation *vl; unsigned long delay, expiry; - time_t now; + time64_t now; _enter(""); - now = get_seconds(); + now = ktime_get_real_seconds(); spin_lock(&afs_vlocation_graveyard_lock); while (!list_empty(&afs_vlocation_graveyard)) { @@ -622,13 +623,13 @@ static void afs_vlocation_updater(struct work_struct *work) { struct afs_cache_vlocation vldb; struct afs_vlocation *vl, *xvl; - time_t now; + time64_t now; long timeout; int ret; _enter(""); - now = get_seconds(); + now = ktime_get_real_seconds(); /* find a record to update */ spin_lock(&afs_vlocation_updates_lock); @@ -684,7 +685,8 @@ static void afs_vlocation_updater(struct work_struct *work) /* and then reschedule */ _debug("reschedule"); - vl->update_at = get_seconds() + afs_vlocation_update_timeout; + vl->update_at = ktime_get_real_seconds() + + afs_vlocation_update_timeout; spin_lock(&afs_vlocation_updates_lock); -- cgit v1.1 From 56e714312e7dbd6bb83b2f78d3ec19a404c7649f Mon Sep 17 00:00:00 2001 From: Tina Ruchandani Date: Thu, 16 Mar 2017 16:27:46 +0000 Subject: afs: Prevent callback expiry timer overflow get_seconds() returns real wall-clock seconds. On 32-bit systems this value will overflow in year 2038 and beyond. This patch changes afs_vnode record to use ktime_get_real_seconds() instead, for the fields cb_expires and cb_expires_at. Signed-off-by: Tina Ruchandani Signed-off-by: David Howells --- fs/afs/fsclient.c | 2 +- fs/afs/inode.c | 7 ++++--- fs/afs/internal.h | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 6f917dd..c05452a 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -145,7 +145,7 @@ static void xdr_decode_AFSCallBack(const __be32 **_bp, struct afs_vnode *vnode) vnode->cb_version = ntohl(*bp++); vnode->cb_expiry = ntohl(*bp++); vnode->cb_type = ntohl(*bp++); - vnode->cb_expires = vnode->cb_expiry + get_seconds(); + vnode->cb_expires = vnode->cb_expiry + ktime_get_real_seconds(); *_bp = bp; } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index e083e08..4079c83 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -246,12 +246,13 @@ struct inode *afs_iget(struct super_block *sb, struct key *key, vnode->cb_version = 0; vnode->cb_expiry = 0; vnode->cb_type = 0; - vnode->cb_expires = get_seconds(); + vnode->cb_expires = ktime_get_real_seconds(); } else { vnode->cb_version = cb->version; vnode->cb_expiry = cb->expiry; vnode->cb_type = cb->type; - vnode->cb_expires = vnode->cb_expiry + get_seconds(); + vnode->cb_expires = vnode->cb_expiry + + ktime_get_real_seconds(); } } @@ -324,7 +325,7 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) !test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) && !test_bit(AFS_VNODE_MODIFIED, &vnode->flags) && !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) { - if (vnode->cb_expires < get_seconds() + 10) { + if (vnode->cb_expires < ktime_get_real_seconds() + 10) { _debug("callback expired"); set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags); } else { diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 97a16ce..8325550 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -377,8 +377,8 @@ struct afs_vnode { struct rb_node server_rb; /* link in server->fs_vnodes */ struct rb_node cb_promise; /* link in server->cb_promises */ struct work_struct cb_broken_work; /* work to be done on callback break */ - time_t cb_expires; /* time at which callback expires */ - time_t cb_expires_at; /* time used to order cb_promise */ + time64_t cb_expires; /* time at which callback expires */ + time64_t cb_expires_at; /* time used to order cb_promise */ unsigned cb_version; /* callback version */ unsigned cb_expiry; /* callback expiry time */ afs_callback_type_t cb_type; /* type of callback */ -- cgit v1.1 From 29f069853287dcb46eaf45a50dbf1232c1444ac6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:46 +0000 Subject: afs: Fix AFS read bug Fix a bug in AFS read whereby the request page afs_read::index isn't incremented after calling ->page_done() if ->remain reaches 0, indicating that the data read is complete. Without this a NULL pointer exception happens when ->page_done() is called twice for the last page because the page clearing loop will call it also and afs_readpages_page_done() clears the current entry in the page list. BUG: unable to handle kernel NULL pointer dereference at (null) IP: afs_readpages_page_done+0x21/0xa4 [kafs] PGD 0 Oops: 0002 [#1] SMP Modules linked in: kafs(E) CPU: 2 PID: 3002 Comm: md5sum Tainted: G E 4.10.0-fscache #485 Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014 task: ffff8804017d86c0 task.stack: ffff8803fc1d8000 RIP: 0010:afs_readpages_page_done+0x21/0xa4 [kafs] RSP: 0018:ffff8803fc1db978 EFLAGS: 00010282 RAX: ffff880405d39af8 RBX: 0000000000000000 RCX: ffff880407d83ed4 RDX: 0000000000000000 RSI: ffff880405d39a00 RDI: ffff880405c6f400 RBP: ffff8803fc1db988 R08: 0000000000000000 R09: 0000000000000001 R10: ffff8803fc1db820 R11: ffff88040cf56000 R12: ffff8804088f1780 R13: ffff8804017d86c0 R14: ffff8804088f1780 R15: 0000000000003840 FS: 00007f8154469700(0000) GS:ffff88041fb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 00000004016ec000 CR4: 00000000001406e0 Call Trace: afs_deliver_fs_fetch_data+0x5b9/0x60e [kafs] ? afs_make_call+0x316/0x4e8 [kafs] ? afs_make_call+0x359/0x4e8 [kafs] afs_deliver_to_call+0x173/0x2e8 [kafs] ? afs_make_call+0x316/0x4e8 [kafs] afs_make_call+0x37a/0x4e8 [kafs] ? wake_up_q+0x4f/0x4f ? __init_waitqueue_head+0x36/0x49 afs_fs_fetch_data+0x21c/0x227 [kafs] ? afs_fs_fetch_data+0x21c/0x227 [kafs] afs_vnode_fetch_data+0xf3/0x1d2 [kafs] afs_readpages+0x314/0x3fd [kafs] __do_page_cache_readahead+0x208/0x2c5 ondemand_readahead+0x3a2/0x3b7 ? ondemand_readahead+0x3a2/0x3b7 page_cache_async_readahead+0x5e/0x67 generic_file_read_iter+0x23b/0x70c ? __inode_security_revalidate+0x2f/0x62 __vfs_read+0xc4/0xe8 vfs_read+0xd1/0x15a SyS_read+0x4c/0x89 do_syscall_64+0x80/0x191 entry_SYSCALL64_slow_path+0x25/0x25 Reported-by: Marc Dionne Signed-off-by: David Howells Tested-by: Marc Dionne --- fs/afs/fsclient.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index c05452a..4314f9e 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -390,9 +390,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) if (call->offset == PAGE_SIZE) { if (req->page_done) req->page_done(call, req); + req->index++; if (req->remain > 0) { call->offset = 0; - req->index++; if (req->index >= req->nr_pages) { call->unmarshall = 4; goto begin_discard; -- cgit v1.1 From 6a0e3999e5cb3daa0468073fcdee0767422a4056 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:46 +0000 Subject: afs: Make struct afs_read::remain 64-bit Make struct afs_read::remain 64-bit so that it can handle huge transfers if we ever request them or the server decides to give us a bit extra data (the other fields there are already 64-bit). Signed-off-by: David Howells Tested-by: Marc Dionne --- fs/afs/fsclient.c | 8 ++++---- fs/afs/internal.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 4314f9e..0778c5b 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -321,7 +321,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) void *buffer; int ret; - _enter("{%u,%zu/%u;%u/%llu}", + _enter("{%u,%zu/%u;%llu/%llu}", call->unmarshall, call->offset, call->count, req->remain, req->actual_len); @@ -379,7 +379,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) /* extract the returned data */ case 3: - _debug("extract data %u/%llu %zu/%u", + _debug("extract data %llu/%llu %zu/%u", req->remain, req->actual_len, call->offset, call->count); buffer = kmap(req->pages[req->index]); @@ -405,9 +405,9 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) /* Discard any excess data the server gave us */ begin_discard: case 4: - size = min_t(size_t, sizeof(afs_discard_buffer), req->remain); + size = min_t(loff_t, sizeof(afs_discard_buffer), req->remain); call->count = size; - _debug("extract discard %u/%llu %zu/%u", + _debug("extract discard %llu/%llu %zu/%u", req->remain, req->actual_len, call->offset, call->count); call->offset = 0; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 8325550..a690136 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -133,8 +133,8 @@ struct afs_read { loff_t pos; /* Where to start reading */ loff_t len; /* How much we're asking for */ loff_t actual_len; /* How much we're actually getting */ + loff_t remain; /* Amount remaining */ atomic_t usage; - unsigned int remain; /* Amount remaining */ unsigned int index; /* Which page we're reading into */ unsigned int nr_pages; void (*page_done)(struct afs_call *, struct afs_read *); -- cgit v1.1 From 2f5705a5c805e7f761f2228820656bb9363a3d8c Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:46 +0000 Subject: afs: Use a bvec rather than a kvec in afs_send_pages() Use a bvec rather than a kvec in afs_send_pages() as we don't then have to call kmap() in advance. This allows us to pass the array of contiguous pages that we extracted through to rxrpc in one go rather than passing a single page at a time. Signed-off-by: David Howells --- fs/afs/rxrpc.c | 97 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 52 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 419ef05..bf45307 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -259,67 +259,74 @@ void afs_flat_call_destructor(struct afs_call *call) call->buffer = NULL; } +#define AFS_BVEC_MAX 8 + +/* + * Load the given bvec with the next few pages. + */ +static void afs_load_bvec(struct afs_call *call, struct msghdr *msg, + struct bio_vec *bv, pgoff_t first, pgoff_t last, + unsigned offset) +{ + struct page *pages[AFS_BVEC_MAX]; + unsigned int nr, n, i, to, bytes = 0; + + nr = min_t(pgoff_t, last - first + 1, AFS_BVEC_MAX); + n = find_get_pages_contig(call->mapping, first, nr, pages); + ASSERTCMP(n, ==, nr); + + msg->msg_flags |= MSG_MORE; + for (i = 0; i < nr; i++) { + to = PAGE_SIZE; + if (first + i >= last) { + to = call->last_to; + msg->msg_flags &= ~MSG_MORE; + } + bv[i].bv_page = pages[i]; + bv[i].bv_len = to - offset; + bv[i].bv_offset = offset; + bytes += to - offset; + offset = 0; + } + + iov_iter_bvec(&msg->msg_iter, WRITE | ITER_BVEC, bv, nr, bytes); +} + /* * attach the data from a bunch of pages on an inode to a call */ static int afs_send_pages(struct afs_call *call, struct msghdr *msg) { - struct page *pages[8]; - unsigned count, n, loop, offset, to; + struct bio_vec bv[AFS_BVEC_MAX]; + unsigned int bytes, nr, loop, offset; pgoff_t first = call->first, last = call->last; int ret; - _enter(""); - offset = call->first_offset; call->first_offset = 0; do { - _debug("attach %lx-%lx", first, last); - - count = last - first + 1; - if (count > ARRAY_SIZE(pages)) - count = ARRAY_SIZE(pages); - n = find_get_pages_contig(call->mapping, first, count, pages); - ASSERTCMP(n, ==, count); - - loop = 0; - do { - struct bio_vec bvec = {.bv_page = pages[loop], - .bv_offset = offset}; - msg->msg_flags = 0; - to = PAGE_SIZE; - if (first + loop >= last) - to = call->last_to; - else - msg->msg_flags = MSG_MORE; - bvec.bv_len = to - offset; - offset = 0; - - _debug("- range %u-%u%s", - offset, to, msg->msg_flags ? " [more]" : ""); - iov_iter_bvec(&msg->msg_iter, WRITE | ITER_BVEC, - &bvec, 1, to - offset); - - /* have to change the state *before* sending the last - * packet as RxRPC might give us the reply before it - * returns from sending the request */ - if (first + loop >= last) - call->state = AFS_CALL_AWAIT_REPLY; - ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, - msg, to - offset); - if (ret < 0) - break; - } while (++loop < count); - first += count; - - for (loop = 0; loop < count; loop++) - put_page(pages[loop]); + afs_load_bvec(call, msg, bv, first, last, offset); + offset = 0; + bytes = msg->msg_iter.count; + nr = msg->msg_iter.nr_segs; + + /* Have to change the state *before* sending the last + * packet as RxRPC might give us the reply before it + * returns from sending the request. + */ + if (first + nr >= last) + call->state = AFS_CALL_AWAIT_REPLY; + ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, + msg, bytes); + for (loop = 0; loop < nr; loop++) + put_page(bv[loop].bv_page); if (ret < 0) break; + + first += nr; } while (first <= last); - _leave(" = %d", ret); return ret; } -- cgit v1.1 From 146a1192783697810b63a1e41c4d59fc93387340 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:47 +0000 Subject: afs: Fix the maths in afs_fs_store_data() afs_fs_store_data() works out of the size of the write it's going to make, but it uses 32-bit unsigned subtraction in one place that gets automatically cast to loff_t. However, if to < offset, then the number goes negative, but as the result isn't signed, this doesn't get sign-extended to 64-bits when placed in a loff_t. Fix by casting the operands to loff_t. Signed-off-by: David Howells --- fs/afs/fsclient.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 0778c5b..d9234b7 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -1236,7 +1236,7 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb, _enter(",%x,{%x:%u},,", key_serial(wb->key), vnode->fid.vid, vnode->fid.vnode); - size = to - offset; + size = (loff_t)to - (loff_t)offset; if (first != last) size += (loff_t)(last - first) << PAGE_SHIFT; pos = (loff_t)first << PAGE_SHIFT; -- cgit v1.1 From 1157f153f37a8586765034470e4f00a4a6c4ce6f Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:47 +0000 Subject: afs: Invalid op ID should abort with RXGEN_OPCODE When we are given an invalid operation ID, we should abort that with RXGEN_OPCODE rather than RX_INVALID_OPERATION. Also map RXGEN_OPCODE to -ENOTSUPP. Signed-off-by: David Howells --- fs/afs/misc.c | 2 ++ fs/afs/rxrpc.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 91ea1aa..100b207 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -84,6 +84,8 @@ int afs_abort_to_error(u32 abort_code) case RXKADDATALEN: return -EKEYREJECTED; case RXKADILLEGALLEVEL: return -EKEYREJECTED; + case RXGEN_OPCODE: return -ENOTSUPP; + default: return -EREMOTEIO; } } diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index bf45307..bf7761f 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -465,7 +465,7 @@ static void afs_deliver_to_call(struct afs_call *call) abort_code, -ret, "KNC"); goto do_abort; case -ENOTSUPP: - abort_code = RX_INVALID_OPERATION; + abort_code = RXGEN_OPCODE; rxrpc_kernel_abort_call(afs_socket, call->rxcall, abort_code, -ret, "KIV"); goto do_abort; -- cgit v1.1 From 70af0e3bd65142f9e674961c975451638a7ce1d5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:47 +0000 Subject: afs: Better abort and net error handling If we receive a network error, a remote abort or a protocol error whilst we're still transmitting data, make sure we return an appropriate error to the caller rather than ESHUTDOWN or ECONNABORTED. Signed-off-by: David Howells --- fs/afs/rxrpc.c | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index bf7761f..22d26b3 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -340,6 +340,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, struct rxrpc_call *rxcall; struct msghdr msg; struct kvec iov[1]; + size_t offset; + u32 abort_code; int ret; _enter("%x,{%d},", addr->s_addr, ntohs(call->port)); @@ -388,9 +390,11 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, msg.msg_controllen = 0; msg.msg_flags = (call->send_pages ? MSG_MORE : 0); - /* have to change the state *before* sending the last packet as RxRPC - * might give us the reply before it returns from sending the - * request */ + /* We have to change the state *before* sending the last packet as + * rxrpc might give us the reply before it returns from sending the + * request. Further, if the send fails, we may already have been given + * a notification and may have collected it. + */ if (!call->send_pages) call->state = AFS_CALL_AWAIT_REPLY; ret = rxrpc_kernel_send_data(afs_socket, rxcall, @@ -412,7 +416,17 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, return afs_wait_for_call_to_complete(call); error_do_abort: - rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, -ret, "KSD"); + call->state = AFS_CALL_COMPLETE; + if (ret != -ECONNABORTED) { + rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, + -ret, "KSD"); + } else { + abort_code = 0; + offset = 0; + rxrpc_kernel_recv_data(afs_socket, rxcall, NULL, 0, &offset, + false, &abort_code); + ret = call->type->abort_to_error(abort_code); + } error_kill_call: afs_put_call(call); _leave(" = %d", ret); @@ -459,16 +473,18 @@ static void afs_deliver_to_call(struct afs_call *call) case -EINPROGRESS: case -EAGAIN: goto out; + case -ECONNABORTED: + goto call_complete; case -ENOTCONN: abort_code = RX_CALL_DEAD; rxrpc_kernel_abort_call(afs_socket, call->rxcall, abort_code, -ret, "KNC"); - goto do_abort; + goto save_error; case -ENOTSUPP: abort_code = RXGEN_OPCODE; rxrpc_kernel_abort_call(afs_socket, call->rxcall, abort_code, -ret, "KIV"); - goto do_abort; + goto save_error; case -ENODATA: case -EBADMSG: case -EMSGSIZE: @@ -478,7 +494,7 @@ static void afs_deliver_to_call(struct afs_call *call) abort_code = RXGEN_SS_UNMARSHAL; rxrpc_kernel_abort_call(afs_socket, call->rxcall, abort_code, EBADMSG, "KUM"); - goto do_abort; + goto save_error; } } @@ -489,8 +505,9 @@ out: _leave(""); return; -do_abort: +save_error: call->error = ret; +call_complete: call->state = AFS_CALL_COMPLETE; goto done; } @@ -538,6 +555,8 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) _debug("call incomplete"); rxrpc_kernel_abort_call(afs_socket, call->rxcall, RX_CALL_DEAD, -ret, abort_why); + } else if (call->error < 0) { + ret = call->error; } _debug("call complete"); -- cgit v1.1 From ab94f5d0dd6fd82e7eeca5e7c8096eaea0a0261f Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Thu, 16 Mar 2017 16:27:47 +0000 Subject: afs: Populate and use client modification time The inode timestamps should be set from the client time in the status received from the server, rather than the server time which is meant for internal server use. Set AFS_SET_MTIME and populate the mtime for operations that take an input status, such as file/dir creation and StoreData. If an input time is not provided the server will set the vnode times based on the current server time. In a situation where the server has some skew with the client, this could lead to the client seeing a timestamp in the future for a file that it just created or wrote. Signed-off-by: Marc Dionne Signed-off-by: David Howells --- fs/afs/fsclient.c | 18 +++++++++--------- fs/afs/inode.c | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index d9234b7..19f76ae 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -111,7 +111,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, vnode->vfs_inode.i_mode = mode; } - vnode->vfs_inode.i_ctime.tv_sec = status->mtime_server; + vnode->vfs_inode.i_ctime.tv_sec = status->mtime_client; vnode->vfs_inode.i_mtime = vnode->vfs_inode.i_ctime; vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime; vnode->vfs_inode.i_version = data_version; @@ -734,8 +734,8 @@ int afs_fs_create(struct afs_server *server, memset(bp, 0, padsz); bp = (void *) bp + padsz; } - *bp++ = htonl(AFS_SET_MODE); - *bp++ = 0; /* mtime */ + *bp++ = htonl(AFS_SET_MODE | AFS_SET_MTIME); + *bp++ = htonl(vnode->vfs_inode.i_mtime.tv_sec); /* mtime */ *bp++ = 0; /* owner */ *bp++ = 0; /* group */ *bp++ = htonl(mode & S_IALLUGO); /* unix mode */ @@ -1003,8 +1003,8 @@ int afs_fs_symlink(struct afs_server *server, memset(bp, 0, c_padsz); bp = (void *) bp + c_padsz; } - *bp++ = htonl(AFS_SET_MODE); - *bp++ = 0; /* mtime */ + *bp++ = htonl(AFS_SET_MODE | AFS_SET_MTIME); + *bp++ = htonl(vnode->vfs_inode.i_mtime.tv_sec); /* mtime */ *bp++ = 0; /* owner */ *bp++ = 0; /* group */ *bp++ = htonl(S_IRWXUGO); /* unix mode */ @@ -1203,8 +1203,8 @@ static int afs_fs_store_data64(struct afs_server *server, *bp++ = htonl(vnode->fid.vnode); *bp++ = htonl(vnode->fid.unique); - *bp++ = 0; /* mask */ - *bp++ = 0; /* mtime */ + *bp++ = htonl(AFS_SET_MTIME); /* mask */ + *bp++ = htonl(vnode->vfs_inode.i_mtime.tv_sec); /* mtime */ *bp++ = 0; /* owner */ *bp++ = 0; /* group */ *bp++ = 0; /* unix mode */ @@ -1280,8 +1280,8 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb, *bp++ = htonl(vnode->fid.vnode); *bp++ = htonl(vnode->fid.unique); - *bp++ = 0; /* mask */ - *bp++ = 0; /* mtime */ + *bp++ = htonl(AFS_SET_MTIME); /* mask */ + *bp++ = htonl(vnode->vfs_inode.i_mtime.tv_sec); /* mtime */ *bp++ = 0; /* owner */ *bp++ = 0; /* group */ *bp++ = 0; /* unix mode */ diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 4079c83..aae55dd 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -85,7 +85,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) inode->i_uid = vnode->status.owner; inode->i_gid = vnode->status.group; inode->i_size = vnode->status.size; - inode->i_ctime.tv_sec = vnode->status.mtime_server; + inode->i_ctime.tv_sec = vnode->status.mtime_client; inode->i_ctime.tv_nsec = 0; inode->i_atime = inode->i_mtime = inode->i_ctime; inode->i_blocks = 0; -- cgit v1.1 From 68ae849d7e674b83610bc7fdf74b21621a09b9ac Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:48 +0000 Subject: afs: Don't set PG_error on local EINTR or ENOMEM when filling a page Don't set PG_error on a page if we get local EINTR or ENOMEM when filling a page for writing. Signed-off-by: David Howells --- fs/afs/file.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/afs/file.c b/fs/afs/file.c index b582944..0d5b850 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -212,7 +212,13 @@ int afs_page_filler(void *data, struct page *page) fscache_uncache_page(vnode->cache, page); #endif BUG_ON(PageFsCache(page)); - goto error; + + if (ret == -EINTR || + ret == -ENOMEM || + ret == -ERESTARTSYS || + ret == -EAGAIN) + goto error; + goto io_error; } SetPageUptodate(page); @@ -231,10 +237,12 @@ int afs_page_filler(void *data, struct page *page) _leave(" = 0"); return 0; +io_error: + SetPageError(page); + goto error; enomem: ret = -ENOMEM; error: - SetPageError(page); unlock_page(page); _leave(" = %d", ret); return ret; -- cgit v1.1 From 6d06b0d25209c80e99c1e89700f1e09694a3766b Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:48 +0000 Subject: afs: Fix page leak in afs_write_begin() afs_write_begin() leaks a ref and a lock on a page if afs_fill_page() fails. Fix the leak by unlocking and releasing the page in the error path. Signed-off-by: David Howells --- fs/afs/write.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/afs/write.c b/fs/afs/write.c index f1450ea..6e13e96 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -154,12 +154,12 @@ int afs_write_begin(struct file *file, struct address_space *mapping, kfree(candidate); return -ENOMEM; } - *pagep = page; - /* page won't leak in error case: it eventually gets cleaned off LRU */ if (!PageUptodate(page) && len != PAGE_SIZE) { ret = afs_fill_page(vnode, key, pos & PAGE_MASK, PAGE_SIZE, page); if (ret < 0) { + unlock_page(page); + put_page(page); kfree(candidate); _leave(" = %d [prep]", ret); return ret; @@ -167,6 +167,9 @@ int afs_write_begin(struct file *file, struct address_space *mapping, SetPageUptodate(page); } + /* page won't leak in error case: it eventually gets cleaned off LRU */ + *pagep = page; + try_again: spin_lock(&vnode->writeback_lock); -- cgit v1.1 From 7286a35e893176169b09715096a4aca557e2ccd2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:48 +0000 Subject: afs: Fix afs_kill_pages() Fix afs_kill_pages() in two ways: (1) If a writeback has been partially flushed, then if we try and kill the pages it contains, some of them may no longer be undergoing writeback and end_page_writeback() will assert. Fix this by checking to see whether the page in question is actually undergoing writeback before ending that writeback. (2) The loop that scans for pages to kill doesn't increase the first page index, and so the loop may not terminate, but it will try to process the same pages over and over again. Fix this by increasing the first page index to one after the last page we processed. Signed-off-by: David Howells --- fs/afs/write.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/afs/write.c b/fs/afs/write.c index 6e13e96..134de06 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -321,10 +321,14 @@ static void afs_kill_pages(struct afs_vnode *vnode, bool error, ASSERTCMP(pv.nr, ==, count); for (loop = 0; loop < count; loop++) { - ClearPageUptodate(pv.pages[loop]); + struct page *page = pv.pages[loop]; + ClearPageUptodate(page); if (error) - SetPageError(pv.pages[loop]); - end_page_writeback(pv.pages[loop]); + SetPageError(page); + if (PageWriteback(page)) + end_page_writeback(page); + if (page->index >= first) + first = page->index + 1; } __pagevec_release(&pv); -- cgit v1.1 From 445783d0ec173a52bef2e9b129de7d716a19b9fa Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:48 +0000 Subject: afs: Fix an off-by-one error in afs_send_pages() afs_send_pages() should only put the call into the AFS_CALL_AWAIT_REPLY state if it has sent all the pages - but the check it makes is incorrect and sometimes it will finish the loop early. Signed-off-by: David Howells --- fs/afs/rxrpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 22d26b3..b12da6a 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -315,7 +315,7 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg) * packet as RxRPC might give us the reply before it * returns from sending the request. */ - if (first + nr >= last) + if (first + nr - 1 >= last) call->state = AFS_CALL_AWAIT_REPLY; ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, msg, bytes); -- cgit v1.1 From 954cd6dc02a65065aecb7150962c0870c5b0e322 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:49 +0000 Subject: afs: Fix abort on signal while waiting for call completion Fix the way in which a call that's in progress and being waited for is aborted in the case that EINTR is detected. We should be sending RX_USER_ABORT rather than RX_CALL_DEAD as the abort code. Note that since the only two ways out of the loop are if the call completes or if a signal happens, the kill-the-call clause after the loop has finished can only happen in the case of EINTR. This means that we only have one abort case to deal with, not two, and the "KWC" case can never happen and so can be deleted. Note further that simply aborting the call isn't necessarily the best thing here since at this point: the request has been entirely sent and it's likely the server will do the operation anyway - whether we abort it or not. In future, we should punt the handling of the remainder of the call off to a background thread. Reported-by: Marc Dionne Signed-off-by: David Howells --- fs/afs/rxrpc.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index b12da6a..8f76b13 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -517,7 +517,6 @@ call_complete: */ static int afs_wait_for_call_to_complete(struct afs_call *call) { - const char *abort_why; int ret; DECLARE_WAITQUEUE(myself, current); @@ -536,13 +535,8 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) continue; } - abort_why = "KWC"; - ret = call->error; - if (call->state == AFS_CALL_COMPLETE) - break; - abort_why = "KWI"; - ret = -EINTR; - if (signal_pending(current)) + if (call->state == AFS_CALL_COMPLETE || + signal_pending(current)) break; schedule(); } @@ -550,15 +544,14 @@ static int afs_wait_for_call_to_complete(struct afs_call *call) remove_wait_queue(&call->waitq, &myself); __set_current_state(TASK_RUNNING); - /* kill the call */ + /* Kill off the call if it's still live. */ if (call->state < AFS_CALL_COMPLETE) { - _debug("call incomplete"); + _debug("call interrupted"); rxrpc_kernel_abort_call(afs_socket, call->rxcall, - RX_CALL_DEAD, -ret, abort_why); - } else if (call->error < 0) { - ret = call->error; + RX_USER_ABORT, -EINTR, "KWI"); } + ret = call->error; _debug("call complete"); afs_put_call(call); _leave(" = %d", ret); -- cgit v1.1 From 65a151094edeb04e8f5f6f1502028e2383e81bb8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:49 +0000 Subject: afs: ->writepage() shouldn't call clear_page_dirty_for_io() The ->writepage() op shouldn't call clear_page_dirty_for_io() as that has already been called by the caller. Fix afs_writepage() by moving the call out of afs_write_back_from_locked_page() to afs_writepages_region() where it is needed. Signed-off-by: David Howells --- fs/afs/write.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/afs/write.c b/fs/afs/write.c index 134de06..e5f150b 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -231,7 +231,7 @@ flush_conflicting_wb: if (wb->state == AFS_WBACK_PENDING) wb->state = AFS_WBACK_CONFLICTING; spin_unlock(&vnode->writeback_lock); - if (PageDirty(page)) { + if (clear_page_dirty_for_io(page)) { ret = afs_write_back_from_locked_page(wb, page); if (ret < 0) { afs_put_writeback(candidate); @@ -353,8 +353,6 @@ static int afs_write_back_from_locked_page(struct afs_writeback *wb, _enter(",%lx", primary_page->index); count = 1; - if (!clear_page_dirty_for_io(primary_page)) - BUG(); if (test_set_page_writeback(primary_page)) BUG(); @@ -542,6 +540,8 @@ static int afs_writepages_region(struct address_space *mapping, wb->state = AFS_WBACK_WRITING; spin_unlock(&wb->vnode->writeback_lock); + if (!clear_page_dirty_for_io(page)) + BUG(); ret = afs_write_back_from_locked_page(wb, page); unlock_page(page); put_page(page); -- cgit v1.1 From c5051c7bc777dffa5661569dec5997f432b9a34a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Mar 2017 16:27:49 +0000 Subject: afs: Don't wait for page writeback with the page lock held Drop the page lock before waiting for page writeback. Signed-off-by: David Howells --- fs/afs/write.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/afs/write.c b/fs/afs/write.c index e5f150b..2d2fccd 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -518,17 +518,16 @@ static int afs_writepages_region(struct address_space *mapping, */ lock_page(page); - if (page->mapping != mapping) { + if (page->mapping != mapping || !PageDirty(page)) { unlock_page(page); put_page(page); continue; } - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (PageWriteback(page) || !PageDirty(page)) { + if (PageWriteback(page)) { unlock_page(page); + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); put_page(page); continue; } -- cgit v1.1 From 38a33101dd5fb8f07244e7e6dd04747a6cd2e3fb Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Thu, 9 Mar 2017 11:36:36 +0800 Subject: NFS: fix the fault nrequests decreasing for nfs_inode COPY The nfs_commit_file for NFSv4.2's COPY operation goes through the commit path for normal WRITE, but without increase nrequests, so, the nrequests decreased in nfs_commit_release_pages is fault. After that, the nrequests will be wrong. [ 5670.299881] ------------[ cut here ]------------ [ 5670.300295] WARNING: CPU: 0 PID: 27656 at fs/nfs/inode.c:127 nfs_clear_inode+0x66/0x90 [nfs] [ 5670.300558] Modules linked in: nfsv4(E) nfs(E) fscache(E) tun bridge stp llc fuse ip_set nfnetlink vmw_vsock_vmci_transport vsock snd_seq_midi snd_seq_midi_event ppdev f2fs coretemp crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_ens1371 intel_rapl_perf gameport snd_ac97_codec vmw_balloon ac97_bus snd_seq snd_pcm joydev snd_rawmidi snd_timer snd_seq_device snd soundcore nfit parport_pc parport acpi_cpufreq tpm_tis tpm_tis_core tpm i2c_piix4 vmw_vmci shpchp nfsd auth_rpcgss nfs_acl lockd grace sunrpc xfs libcrc32c vmwgfx drm_kms_helper ttm drm e1000 crc32c_intel mptspi scsi_transport_spi serio_raw mptscsih mptbase ata_generic pata_acpi fjes [last unloaded: fscache] [ 5670.302925] CPU: 0 PID: 27656 Comm: umount.nfs4 Tainted: G W E 4.11.0-rc1+ #519 [ 5670.303292] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015 [ 5670.304094] Call Trace: [ 5670.304510] dump_stack+0x63/0x86 [ 5670.304917] __warn+0xcb/0xf0 [ 5670.305276] warn_slowpath_null+0x1d/0x20 [ 5670.305661] nfs_clear_inode+0x66/0x90 [nfs] [ 5670.306093] nfs4_evict_inode+0x61/0x70 [nfsv4] [ 5670.306480] evict+0xbb/0x1c0 [ 5670.306888] dispose_list+0x4d/0x70 [ 5670.307233] evict_inodes+0x178/0x1a0 [ 5670.307579] generic_shutdown_super+0x44/0xf0 [ 5670.307985] nfs_kill_super+0x21/0x40 [nfs] [ 5670.308325] deactivate_locked_super+0x43/0x70 [ 5670.308698] deactivate_super+0x5a/0x60 [ 5670.309036] cleanup_mnt+0x3f/0x90 [ 5670.309407] __cleanup_mnt+0x12/0x20 [ 5670.309837] task_work_run+0x80/0xa0 [ 5670.310162] exit_to_usermode_loop+0x89/0x90 [ 5670.310497] syscall_return_slowpath+0xaa/0xb0 [ 5670.310875] entry_SYSCALL_64_fastpath+0xa7/0xa9 [ 5670.311197] RIP: 0033:0x7f1bb3617fe7 [ 5670.311545] RSP: 002b:00007ffecbabb828 EFLAGS: 00000206 ORIG_RAX: 00000000000000a6 [ 5670.311906] RAX: 0000000000000000 RBX: 0000000001dca1f0 RCX: 00007f1bb3617fe7 [ 5670.312239] RDX: 000000000000000c RSI: 0000000000000001 RDI: 0000000001dc83c0 [ 5670.312653] RBP: 0000000001dc83c0 R08: 0000000000000001 R09: 0000000000000000 [ 5670.312998] R10: 0000000000000755 R11: 0000000000000206 R12: 00007ffecbabc66a [ 5670.313335] R13: 0000000001dc83a0 R14: 0000000000000000 R15: 0000000000000000 [ 5670.313758] ---[ end trace bf4bfe7764e4eb40 ]--- Cc: linux-kernel@vger.kernel.org Fixes: 67911c8f18 ("NFS: Add nfs_commit_file()") Signed-off-by: Kinglong Mee Cc: stable@vger.kernel.org # 4.7+ Signed-off-by: Anna Schumaker --- fs/nfs/write.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e75b056..abb2c8a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1784,7 +1784,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) (long long)req_offset(req)); if (status < 0) { nfs_context_set_write_error(req->wb_context, status); - nfs_inode_remove_request(req); + if (req->wb_page) + nfs_inode_remove_request(req); dprintk_cont(", error = %d\n", status); goto next; } @@ -1793,7 +1794,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) * returned by the server against all stored verfs. */ if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) { /* We have a match */ - nfs_inode_remove_request(req); + if (req->wb_page) + nfs_inode_remove_request(req); dprintk_cont(" OK\n"); goto next; } -- cgit v1.1 From 05fae7bbc237bc7de0ee9c3dcf85b2572a80e3b5 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Fri, 10 Mar 2017 10:48:13 +0800 Subject: nfs: make nfs4_cb_sv_ops static Fixes the following sparse warning: fs/nfs/callback.c:235:21: warning: symbol 'nfs4_cb_sv_ops' was not declared. Should it be static? Signed-off-by: Jason Yan Signed-off-by: Anna Schumaker --- fs/nfs/callback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 484bebc..5c8a096 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -231,12 +231,12 @@ static struct svc_serv_ops nfs41_cb_sv_ops = { .svo_module = THIS_MODULE, }; -struct svc_serv_ops *nfs4_cb_sv_ops[] = { +static struct svc_serv_ops *nfs4_cb_sv_ops[] = { [0] = &nfs40_cb_sv_ops, [1] = &nfs41_cb_sv_ops, }; #else -struct svc_serv_ops *nfs4_cb_sv_ops[] = { +static struct svc_serv_ops *nfs4_cb_sv_ops[] = { [0] = &nfs40_cb_sv_ops, [1] = NULL, }; -- cgit v1.1 From 63513232f8cd219dcaa5eafae028740ed3067d83 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 13 Mar 2017 10:36:19 -0400 Subject: NFS prevent double free in async nfs4_exchange_id Since rpc_task is async, the release function should be called which will free the impl_id, scope, and owner. Trond pointed at 2 more problems: -- use of client pointer after free in the nfs4_exchangeid_release() function -- cl_count mismatch if rpc_run_task() isn't run Fixes: 8d89bd70bc9 ("NFS setup async exchange_id") Signed-off-by: Olga Kornievskaia Cc: stable@vger.kernel.org # 4.9 Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c1f5369..c780d98 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7425,11 +7425,11 @@ static void nfs4_exchange_id_release(void *data) struct nfs41_exchange_id_data *cdata = (struct nfs41_exchange_id_data *)data; - nfs_put_client(cdata->args.client); if (cdata->xprt) { xprt_put(cdata->xprt); rpc_clnt_xprt_switch_put(cdata->args.client->cl_rpcclient); } + nfs_put_client(cdata->args.client); kfree(cdata->res.impl_id); kfree(cdata->res.server_scope); kfree(cdata->res.server_owner); @@ -7536,10 +7536,8 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, task_setup_data.callback_data = calldata; task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - status = PTR_ERR(task); - goto out_impl_id; - } + if (IS_ERR(task)) + return PTR_ERR(task); if (!xprt) { status = rpc_wait_for_completion_task(task); @@ -7567,6 +7565,7 @@ out_server_owner: kfree(calldata->res.server_owner); out_calldata: kfree(calldata); + nfs_put_client(clp); goto out; } -- cgit v1.1 From 033853325fe3bdc70819a8b97915bd3bca41d3af Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 8 Mar 2017 14:39:15 -0500 Subject: NFSv4.1 respect server's max size in CREATE_SESSION Currently client doesn't respect max sizes server returns in CREATE_SESSION. nfs4_session_set_rwsize() gets called and server->rsize, server->wsize are 0 so they never get set to the sizes returned by the server. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs4client.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 5ae9d64..8346ccb 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -1023,9 +1023,9 @@ static void nfs4_session_set_rwsize(struct nfs_server *server) server_resp_sz = sess->fc_attrs.max_resp_sz - nfs41_maxread_overhead; server_rqst_sz = sess->fc_attrs.max_rqst_sz - nfs41_maxwrite_overhead; - if (server->rsize > server_resp_sz) + if (!server->rsize || server->rsize > server_resp_sz) server->rsize = server_resp_sz; - if (server->wsize > server_rqst_sz) + if (!server->wsize || server->wsize > server_rqst_sz) server->wsize = server_rqst_sz; #endif /* CONFIG_NFS_V4_1 */ } -- cgit v1.1 From a33e4b036d4612f62220f37a9fa29d273b6fd0ca Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 9 Mar 2017 12:56:48 -0500 Subject: pNFS: return status from nfs4_pnfs_ds_connect The nfs4_pnfs_ds_connect path can call rpc_create which can fail or it can wait on another context to reach the same failure. This checks that the rpc_create succeeded and returns the error to the caller. When an error is returned, both the files and flexfiles layouts will return NULL from _prepare_ds(). The flexfiles layout will also return the layout with the error NFS4ERR_NXIO. Signed-off-by: Weston Andros Adamson Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 25 ++++++++++++++++++++++++- fs/nfs/filelayout/filelayoutdev.c | 7 ++++++- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 3 ++- fs/nfs/internal.h | 2 ++ fs/nfs/pnfs.h | 2 +- fs/nfs/pnfs_nfs.c | 15 +++++++++++++-- 6 files changed, 48 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 91a8d61..390ada8 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -325,10 +325,33 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat return NULL; } -static bool nfs_client_init_is_complete(const struct nfs_client *clp) +/* + * Return true if @clp is done initializing, false if still working on it. + * + * Use nfs_client_init_status to check if it was successful. + */ +bool nfs_client_init_is_complete(const struct nfs_client *clp) { return clp->cl_cons_state <= NFS_CS_READY; } +EXPORT_SYMBOL_GPL(nfs_client_init_is_complete); + +/* + * Return 0 if @clp was successfully initialized, -errno otherwise. + * + * This must be called *after* nfs_client_init_is_complete() returns true, + * otherwise it will pop WARN_ON_ONCE and return -EINVAL + */ +int nfs_client_init_status(const struct nfs_client *clp) +{ + /* called without checking nfs_client_init_is_complete */ + if (clp->cl_cons_state > NFS_CS_READY) { + WARN_ON_ONCE(1); + return -EINVAL; + } + return clp->cl_cons_state; +} +EXPORT_SYMBOL_GPL(nfs_client_init_status); int nfs_wait_client_init_complete(const struct nfs_client *clp) { diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index f956ca2..1881206 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -266,6 +266,7 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); struct nfs4_pnfs_ds *ret = ds; struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); + int status; if (ds == NULL) { printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", @@ -277,9 +278,13 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) if (ds->ds_clp) goto out_test_devid; - nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, + status = nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, dataserver_retrans, 4, s->nfs_client->cl_minorversion); + if (status) { + ret = NULL; + goto out; + } out_test_devid: if (ret->ds_clp == NULL || diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index e5a6f24..544e772 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -384,6 +384,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, struct inode *ino = lseg->pls_layout->plh_inode; struct nfs_server *s = NFS_SERVER(ino); unsigned int max_payload; + int status; if (!ff_layout_mirror_valid(lseg, mirror, true)) { pr_err_ratelimited("NFS: %s: No data server for offset index %d\n", @@ -404,7 +405,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, /* FIXME: For now we assume the server sent only one version of NFS * to use for the DS. */ - nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, + status = nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, dataserver_retrans, mirror->mirror_ds->ds_versions[0].version, mirror->mirror_ds->ds_versions[0].minor_version); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 09ca5095..7b38fed 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -186,6 +186,8 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, rpc_authflavor_t); +extern bool nfs_client_init_is_complete(const struct nfs_client *clp); +extern int nfs_client_init_status(const struct nfs_client *clp); extern int nfs_wait_client_init_complete(const struct nfs_client *clp); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 63f77b4..590e1e3 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -367,7 +367,7 @@ void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds); struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags); void nfs4_pnfs_v3_ds_connect_unload(void); -void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, +int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, struct nfs4_deviceid_node *devid, unsigned int timeo, unsigned int retrans, u32 version, u32 minor_version); struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net, diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 9414b49..a7691b9 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -745,9 +745,9 @@ out: /* * Create an rpc connection to the nfs4_pnfs_ds data server. * Currently only supports IPv4 and IPv6 addresses. - * If connection fails, make devid unavailable. + * If connection fails, make devid unavailable and return a -errno. */ -void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, +int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, struct nfs4_deviceid_node *devid, unsigned int timeo, unsigned int retrans, u32 version, u32 minor_version) { @@ -772,6 +772,17 @@ void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, } else { nfs4_wait_ds_connect(ds); } + + /* + * At this point the ds->ds_clp should be ready, but it might have + * hit an error. + */ + if (!ds->ds_clp || !nfs_client_init_is_complete(ds->ds_clp)) { + WARN_ON_ONCE(1); + return -EINVAL; + } + + return nfs_client_init_status(ds->ds_clp); } EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_connect); -- cgit v1.1 From da066f3f039eba3e72e97b2ccad0dd8b45ba84bd Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Thu, 9 Mar 2017 12:56:49 -0500 Subject: pNFS/flexfiles: never nfs4_mark_deviceid_unavailable The flexfiles layout should never mark a device unavailable. Move nfs4_mark_deviceid_unavailable out of nfs4_pnfs_ds_connect and call directly from files layout where it's still needed. The flexfiles driver still handles marked devices in error paths, but will now print a rate limited warning. Signed-off-by: Weston Andros Adamson Signed-off-by: Anna Schumaker --- fs/nfs/filelayout/filelayoutdev.c | 1 + fs/nfs/flexfilelayout/flexfilelayout.h | 14 +++++++++++++- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 2 +- fs/nfs/pnfs_nfs.c | 24 ++++++++++++++++-------- 4 files changed, 31 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 1881206..d913e81 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -282,6 +282,7 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) dataserver_retrans, 4, s->nfs_client->cl_minorversion); if (status) { + nfs4_mark_deviceid_unavailable(devid); ret = NULL; goto out; } diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index f4f39b0..98b34c9 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -175,7 +175,19 @@ ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg) static inline bool ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node) { - return nfs4_test_deviceid_unavailable(node); + /* + * Flexfiles should never mark a DS unavailable, but if it does + * print a (ratelimited) warning as this can affect performance. + */ + if (nfs4_test_deviceid_unavailable(node)) { + u32 *p = (u32 *)node->deviceid.data; + + pr_warn_ratelimited("NFS: flexfiles layout referencing an " + "unavailable device [%x%x%x%x]\n", + p[0], p[1], p[2], p[3]); + return true; + } + return false; } static inline int diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 544e772..85fde93 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -421,11 +421,11 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, mirror->mirror_ds->ds_versions[0].wsize = max_payload; goto out; } +out_fail: ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), mirror, lseg->pls_range.offset, lseg->pls_range.length, NFS4ERR_NXIO, OP_ILLEGAL, GFP_NOIO); -out_fail: if (fail_return || !ff_layout_has_available_ds(lseg)) pnfs_error_mark_layout_for_return(ino, lseg); ds = NULL; diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index a7691b9..7250b95 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -751,9 +751,11 @@ int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, struct nfs4_deviceid_node *devid, unsigned int timeo, unsigned int retrans, u32 version, u32 minor_version) { - if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { - int err = 0; + int err; +again: + err = 0; + if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { if (version == 3) { err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo, retrans); @@ -766,23 +768,29 @@ int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, err = -EPROTONOSUPPORT; } - if (err) - nfs4_mark_deviceid_unavailable(devid); nfs4_clear_ds_conn_bit(ds); } else { nfs4_wait_ds_connect(ds); + + /* what was waited on didn't connect AND didn't mark unavail */ + if (!ds->ds_clp && !nfs4_test_deviceid_unavailable(devid)) + goto again; } /* * At this point the ds->ds_clp should be ready, but it might have * hit an error. */ - if (!ds->ds_clp || !nfs_client_init_is_complete(ds->ds_clp)) { - WARN_ON_ONCE(1); - return -EINVAL; + if (!err) { + if (!ds->ds_clp || !nfs_client_init_is_complete(ds->ds_clp)) { + WARN_ON_ONCE(ds->ds_clp || + !nfs4_test_deviceid_unavailable(devid)); + return -EINVAL; + } + err = nfs_client_init_status(ds->ds_clp); } - return nfs_client_init_status(ds->ds_clp); + return err; } EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_connect); -- cgit v1.1