From a821df3f1af72aa6a0d573eea94a7dd2613e9f4e Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Tue, 21 Nov 2017 09:36:33 +1100 Subject: cifs: fix NULL deref in SMB2_read Signed-off-by: Ronnie Sahlberg Reviewed-by: Pavel Shilovsky CC: Stable Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 5331631..01346b8 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2678,27 +2678,27 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, cifs_small_buf_release(req); rsp = (struct smb2_read_rsp *)rsp_iov.iov_base; - shdr = get_sync_hdr(rsp); - if (shdr->Status == STATUS_END_OF_FILE) { + if (rc) { + if (rc != -ENODATA) { + cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); + cifs_dbg(VFS, "Send error in read = %d\n", rc); + } free_rsp_buf(resp_buftype, rsp_iov.iov_base); - return 0; + return rc == -ENODATA ? 0 : rc; } - if (rc) { - cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); - cifs_dbg(VFS, "Send error in read = %d\n", rc); - } else { - *nbytes = le32_to_cpu(rsp->DataLength); - if ((*nbytes > CIFS_MAX_MSGSIZE) || - (*nbytes > io_parms->length)) { - cifs_dbg(FYI, "bad length %d for count %d\n", - *nbytes, io_parms->length); - rc = -EIO; - *nbytes = 0; - } + *nbytes = le32_to_cpu(rsp->DataLength); + if ((*nbytes > CIFS_MAX_MSGSIZE) || + (*nbytes > io_parms->length)) { + cifs_dbg(FYI, "bad length %d for count %d\n", + *nbytes, io_parms->length); + rc = -EIO; + *nbytes = 0; } + shdr = get_sync_hdr(rsp); + if (*buf) { memcpy(*buf, (char *)shdr + rsp->DataOffset, *nbytes); free_rsp_buf(resp_buftype, rsp_iov.iov_base); -- cgit v1.1 From 5702591fc6a3f409f460def104ee149330dac82d Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Tue, 21 Nov 2017 14:47:56 +0100 Subject: CIFS: don't log STATUS_NOT_FOUND errors for DFS cifs.ko makes DFS queries regardless of the type of the server and non-DFS servers are common. This often results in superfluous logging of non-critical errors. Signed-off-by: Aurelien Aptel Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg --- fs/cifs/smb2ops.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index e067404..ed88ab8 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1406,7 +1406,8 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses, } while (rc == -EAGAIN); if (rc) { - cifs_dbg(VFS, "ioctl error in smb2_get_dfs_refer rc=%d\n", rc); + if (rc != -ENOENT) + cifs_dbg(VFS, "ioctl error in smb2_get_dfs_refer rc=%d\n", rc); goto out; } -- cgit v1.1 From b430b7751286b3acff2d324553c8cec4f1e87764 Mon Sep 17 00:00:00 2001 From: Justin Maggard Date: Mon, 30 Oct 2017 15:29:10 -0700 Subject: btrfs: Fix quota reservation leak on preallocated files Commit c6887cd11149 ("Btrfs: don't do nocow check unless we have to") changed the behavior of __btrfs_buffered_write() so that it first tries to get a data space reservation, and then skips the relatively expensive nocow check if the reservation succeeded. If we have quotas enabled, the data space reservation also includes a quota reservation. But in the rewrite case, the space has already been accounted for in qgroups. So btrfs_check_data_free_space() increases the quota reservation, but it never gets decreased when the data actually gets written and overwrites the pre-existing data. So we're left with both the qgroup and qgroup reservation accounting for the same space. This commit adds the missing btrfs_qgroup_free_data() call in the case of BTRFS_ORDERED_PREALLOC extents. Fixes: c6887cd11149 ("Btrfs: don't do nocow check unless we have to") Signed-off-by: Justin Maggard Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 993061f..e1a7f3c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3005,6 +3005,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compress_type); + btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, + ordered_extent->len); ret = btrfs_mark_extent_written(trans, BTRFS_I(inode), ordered_extent->file_offset, ordered_extent->file_offset + -- cgit v1.1 From 692826b2738101549f032a761a9191636e83be4e Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 21 Nov 2017 13:58:49 -0500 Subject: btrfs: handle errors while updating refcounts in update_ref_for_cow Since commit fb235dc06fa (btrfs: qgroup: Move half of the qgroup accounting time out of commit trans) the assumption that btrfs_add_delayed_{data,tree}_ref can only return 0 or -ENOMEM has been false. The qgroup operations call into btrfs_search_slot and friends and can now return the full spectrum of error codes. Fortunately, the fix here is easy since update_ref_for_cow failing is already handled so we just need to bail early with the error code. Fixes: fb235dc06fa (btrfs: qgroup: Move half of the qgroup accounting ...) Cc: # v4.11+ Signed-off-by: Jeff Mahoney Reviewed-by: Edmund Nadolski Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 531e0a8..1e74cf8 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1032,14 +1032,17 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { ret = btrfs_inc_ref(trans, root, buf, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { ret = btrfs_dec_ref(trans, root, buf, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; ret = btrfs_inc_ref(trans, root, cow, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; } new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; } else { @@ -1049,7 +1052,8 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_inc_ref(trans, root, cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; } if (new_flags != 0) { int level = btrfs_header_level(buf); @@ -1068,9 +1072,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_inc_ref(trans, root, cow, 1); else ret = btrfs_inc_ref(trans, root, cow, 0); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; ret = btrfs_dec_ref(trans, root, buf, 1); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + return ret; } clean_tree_block(fs_info, buf); *last_ref = 1; -- cgit v1.1 From e19182c0fff451e3744c1107d98f072e7ca377a0 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 4 Dec 2017 13:11:45 -0500 Subject: btrfs: fix missing error return in btrfs_drop_snapshot If btrfs_del_root fails in btrfs_drop_snapshot, we'll pick up the error but then return 0 anyway due to mixing err and ret. Fixes: 79787eaab4612 ("btrfs: replace many BUG_ONs with proper error handling") Cc: # v3.4+ Signed-off-by: Jeff Mahoney Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 784d41e..16e46ee 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9220,6 +9220,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, ret = btrfs_del_root(trans, fs_info, &root->root_key); if (ret) { btrfs_abort_transaction(trans, ret); + err = ret; goto out_end_trans; } -- cgit v1.1 From 1b9e619c5bc8235cfba3dc4ced2fb0e3554a05d4 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 5 Dec 2017 22:54:02 -0800 Subject: Btrfs: disable FUA if mounted with nobarrier I was seeing disk flushes still happening when I mounted a Btrfs filesystem with nobarrier for testing. This is because we use FUA to write out the first super block, and on devices without FUA support, the block layer translates FUA to a flush. Even on devices supporting true FUA, using FUA when we asked for no barriers is surprising. Fixes: 387125fc722a8ed ("Btrfs: fix barrier flushes") Signed-off-by: Omar Sandoval Reviewed-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 10a2a57..a8ecccf 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3231,6 +3231,7 @@ static int write_dev_supers(struct btrfs_device *device, int errors = 0; u32 crc; u64 bytenr; + int op_flags; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; @@ -3273,13 +3274,10 @@ static int write_dev_supers(struct btrfs_device *device, * we fua the first super. The others we allow * to go down lazy. */ - if (i == 0) { - ret = btrfsic_submit_bh(REQ_OP_WRITE, - REQ_SYNC | REQ_FUA | REQ_META | REQ_PRIO, bh); - } else { - ret = btrfsic_submit_bh(REQ_OP_WRITE, - REQ_SYNC | REQ_META | REQ_PRIO, bh); - } + op_flags = REQ_SYNC | REQ_META | REQ_PRIO; + if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) + op_flags |= REQ_FUA; + ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh); if (ret) errors++; } -- cgit v1.1 From c8bcbfbd239ed60a6562964b58034ac8a25f4c31 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 1 Dec 2017 11:19:42 +0200 Subject: btrfs: Fix possible off-by-one in btrfs_search_path_in_tree The name char array passed to btrfs_search_path_in_tree is of size BTRFS_INO_LOOKUP_PATH_MAX (4080). So the actual accessible char indexes are in the range of [0, 4079]. Currently the code uses the define but this represents an off-by-one. Implications: Size of btrfs_ioctl_ino_lookup_args is 4096, so the new byte will be written to extra space, not some padding that could be provided by the allocator. btrfs-progs store the arguments on stack, but kernel does own copy of the ioctl buffer and the off-by-one overwrite does not affect userspace, but the ending 0 might be lost. Kernel ioctl buffer is allocated dynamically so we're overwriting somebody else's memory, and the ioctl is privileged if args.objectid is not 256. Which is in most cases, but resolving a subvolume stored in another directory will trigger that path. Before this patch the buffer was one byte larger, but then the -1 was not added. Fixes: ac8e9819d71f907 ("Btrfs: add search and inode lookup ioctls") Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba [ added implications ] Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fd172a9..1a508ff 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2206,7 +2206,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, if (!path) return -ENOMEM; - ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX]; + ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1]; key.objectid = tree_id; key.type = BTRFS_ROOT_ITEM_KEY; -- cgit v1.1 From 040d786032bf59002d374b86d75b04d97624005c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 30 Nov 2017 11:59:22 +0800 Subject: ceph: drop negative child dentries before try pruning inode's alias Negative child dentry holds reference on inode's alias, it makes d_prune_aliases() do nothing. Cc: stable@vger.kernel.org Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 42 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index ab69dcb..1b46825 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1440,6 +1440,29 @@ static int __close_session(struct ceph_mds_client *mdsc, return request_close_session(mdsc, session); } +static bool drop_negative_children(struct dentry *dentry) +{ + struct dentry *child; + bool all_negative = true; + + if (!d_is_dir(dentry)) + goto out; + + spin_lock(&dentry->d_lock); + list_for_each_entry(child, &dentry->d_subdirs, d_child) { + if (d_really_is_positive(child)) { + all_negative = false; + break; + } + } + spin_unlock(&dentry->d_lock); + + if (all_negative) + shrink_dcache_parent(dentry); +out: + return all_negative; +} + /* * Trim old(er) caps. * @@ -1490,16 +1513,27 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) if ((used | wanted) & ~oissued & mine) goto out; /* we need these caps */ - session->s_trim_caps--; if (oissued) { /* we aren't the only cap.. just remove us */ __ceph_remove_cap(cap, true); + session->s_trim_caps--; } else { + struct dentry *dentry; /* try dropping referring dentries */ spin_unlock(&ci->i_ceph_lock); - d_prune_aliases(inode); - dout("trim_caps_cb %p cap %p pruned, count now %d\n", - inode, cap, atomic_read(&inode->i_count)); + dentry = d_find_any_alias(inode); + if (dentry && drop_negative_children(dentry)) { + int count; + dput(dentry); + d_prune_aliases(inode); + count = atomic_read(&inode->i_count); + if (count == 1) + session->s_trim_caps--; + dout("trim_caps_cb %p cap %p pruned, count now %d\n", + inode, cap, count); + } else { + dput(dentry); + } return 0; } -- cgit v1.1 From eaf0ec303bd73f6b2c18f48542974a710fadfeb9 Mon Sep 17 00:00:00 2001 From: Pravin Shedge Date: Wed, 6 Dec 2017 10:16:15 -0800 Subject: fs: xfs: remove duplicate includes These duplicate includes have been found with scripts/checkincludes.pl but they have been removed manually to avoid removing false positives. Signed-off-by: Pravin Shedge Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/scrub/scrub.c | 1 - fs/xfs/scrub/trace.c | 1 - fs/xfs/xfs_reflink.c | 2 -- fs/xfs/xfs_trace.c | 1 - 4 files changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 9c42c4e..ab3aef2 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -46,7 +46,6 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" -#include "scrub/scrub.h" #include "scrub/btree.h" /* diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 472080e..86daed0 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -26,7 +26,6 @@ #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_da_format.h" -#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_trans.h" diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index cc041a2..cf7c8f8 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -49,8 +49,6 @@ #include "xfs_alloc.h" #include "xfs_quota_defs.h" #include "xfs_quota.h" -#include "xfs_btree.h" -#include "xfs_bmap_btree.h" #include "xfs_reflink.h" #include "xfs_iomap.h" #include "xfs_rmap_btree.h" diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 5d95fe3..35f3546 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -24,7 +24,6 @@ #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_da_format.h" -#include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_da_btree.h" -- cgit v1.1 From f59cf5c29919d17b61913c3360a7bd29b72975c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Dec 2017 17:32:55 -0800 Subject: xfs: remove "no-allocation" reservations for file creations If we create a new file we will need an inode, and usually some metadata in the parent direction. Aiming for everything to go well despite the lack of a reservation leads to dirty transactions cancelled under a heavy create/delete load. This patch removes those nospace transactions, which will lead to slightly earlier ENOSPC on some workloads, but instead prevent file system shutdowns due to cancelling dirty transactions for others. A customer could observe assertations failures and shutdowns due to cancelation of dirty transactions during heavy NFS workloads as shown below: 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728125] XFS: Assertion failed: error != -ENOSPC, file: fs/xfs/xfs_inode.c, line: 1262 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728222] Call Trace: 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728246] [] dump_stack+0x63/0x81 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728262] [] warn_slowpath_common+0x8a/0xc0 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728264] [] warn_slowpath_null+0x1a/0x20 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728285] [] asswarn+0x33/0x40 [xfs] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728308] [] xfs_create+0x7be/0x7d0 [xfs] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728329] [] xfs_generic_create+0x1fb/0x2e0 [xfs] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728348] [] xfs_vn_mknod+0x14/0x20 [xfs] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728366] [] xfs_vn_create+0x13/0x20 [xfs] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728380] [] vfs_create+0xd5/0x140 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728390] [] do_nfsd_create+0x499/0x610 [nfsd] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728396] [] nfsd3_proc_create+0x135/0x210 [nfsd] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728401] [] nfsd_dispatch+0xc3/0x210 [nfsd] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728416] [] svc_process_common+0x453/0x6f0 [sunrpc] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728423] [] svc_process+0x113/0x1f0 [sunrpc] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728427] [] nfsd+0x10f/0x180 [nfsd] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728432] [] ? nfsd_destroy+0x80/0x80 [nfsd] 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728438] [] kthread+0xd8/0xf0 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728441] [] ? kthread_create_on_node+0x1b0/0x1b0 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728451] [] ret_from_fork+0x42/0x70 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728453] [] ? kthread_create_on_node+0x1b0/0x1b0 2017-05-30 21:17:06 kernel: WARNING: [ 2670.728454] ---[ end trace f9822c842fec81d4 ]--- 2017-05-30 21:17:06 kernel: ALERT: [ 2670.728477] XFS (sdb): Internal error xfs_trans_cancel at line 983 of file fs/xfs/xfs_trans.c. Caller xfs_create+0x4ee/0x7d0 [xfs] 2017-05-30 21:17:06 kernel: ALERT: [ 2670.728684] XFS (sdb): Corruption of in-memory data detected. Shutting down filesystem 2017-05-30 21:17:06 kernel: ALERT: [ 2670.728685] XFS (sdb): Please umount the filesystem and rectify the problem(s) Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc.c | 10 +++------- fs/xfs/libxfs/xfs_ialloc.h | 1 - fs/xfs/xfs_inode.c | 33 +++++++-------------------------- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_qm.c | 4 ++-- fs/xfs/xfs_symlink.c | 15 +-------------- 6 files changed, 14 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index de3f04a..3b57ef0 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -920,8 +920,7 @@ STATIC xfs_agnumber_t xfs_ialloc_ag_select( xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t parent, /* parent directory inode number */ - umode_t mode, /* bits set to indicate file type */ - int okalloc) /* ok to allocate more space */ + umode_t mode) /* bits set to indicate file type */ { xfs_agnumber_t agcount; /* number of ag's in the filesystem */ xfs_agnumber_t agno; /* current ag number */ @@ -978,9 +977,6 @@ xfs_ialloc_ag_select( return agno; } - if (!okalloc) - goto nextag; - if (!pag->pagf_init) { error = xfs_alloc_pagf_init(mp, tp, agno, flags); if (error) @@ -1680,7 +1676,6 @@ xfs_dialloc( struct xfs_trans *tp, xfs_ino_t parent, umode_t mode, - int okalloc, struct xfs_buf **IO_agbp, xfs_ino_t *inop) { @@ -1692,6 +1687,7 @@ xfs_dialloc( int noroom = 0; xfs_agnumber_t start_agno; struct xfs_perag *pag; + int okalloc = 1; if (*IO_agbp) { /* @@ -1707,7 +1703,7 @@ xfs_dialloc( * We do not have an agbp, so select an initial allocation * group for inode allocation. */ - start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc); + start_agno = xfs_ialloc_ag_select(tp, parent, mode); if (start_agno == NULLAGNUMBER) { *inop = NULLFSINO; return 0; diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index d2bdcd5..66a8de0 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -81,7 +81,6 @@ xfs_dialloc( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t parent, /* parent inode (directory) */ umode_t mode, /* mode bits for new inode */ - int okalloc, /* ok to allocate more space */ struct xfs_buf **agbp, /* buf for a.g. inode header */ xfs_ino_t *inop); /* inode number allocated */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8012741..b41952a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -749,7 +749,6 @@ xfs_ialloc( xfs_nlink_t nlink, dev_t rdev, prid_t prid, - int okalloc, xfs_buf_t **ialloc_context, xfs_inode_t **ipp) { @@ -765,7 +764,7 @@ xfs_ialloc( * Call the space management code to pick * the on-disk inode to be allocated. */ - error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, + error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, ialloc_context, &ino); if (error) return error; @@ -957,7 +956,6 @@ xfs_dir_ialloc( xfs_nlink_t nlink, dev_t rdev, prid_t prid, /* project id */ - int okalloc, /* ok to allocate new space */ xfs_inode_t **ipp, /* pointer to inode; it will be locked. */ int *committed) @@ -988,8 +986,8 @@ xfs_dir_ialloc( * transaction commit so that no other process can steal * the inode(s) that we've just allocated. */ - code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, - &ialloc_context, &ip); + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, &ialloc_context, + &ip); /* * Return an error if we were unable to allocate a new inode. @@ -1061,7 +1059,7 @@ xfs_dir_ialloc( * this call should always succeed. */ code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, - okalloc, &ialloc_context, &ip); + &ialloc_context, &ip); /* * If we get an error at this point, return to the caller @@ -1182,11 +1180,6 @@ xfs_create( xfs_flush_inodes(mp); error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); } - if (error == -ENOSPC) { - /* No space at all so try a "no-allocation" reservation */ - resblks = 0; - error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp); - } if (error) goto out_release_inode; @@ -1203,19 +1196,13 @@ xfs_create( if (error) goto out_trans_cancel; - if (!resblks) { - error = xfs_dir_canenter(tp, dp, name); - if (error) - goto out_trans_cancel; - } - /* * A newly created regular or special file just has one directory * entry pointing to them, but a directory also the "." entry * pointing to itself. */ - error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, - prid, resblks > 0, &ip, NULL); + error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip, + NULL); if (error) goto out_trans_cancel; @@ -1340,11 +1327,6 @@ xfs_create_tmpfile( tres = &M_RES(mp)->tr_create_tmpfile; error = xfs_trans_alloc(mp, tres, resblks, 0, 0, &tp); - if (error == -ENOSPC) { - /* No space at all so try a "no-allocation" reservation */ - resblks = 0; - error = xfs_trans_alloc(mp, tres, 0, 0, 0, &tp); - } if (error) goto out_release_inode; @@ -1353,8 +1335,7 @@ xfs_create_tmpfile( if (error) goto out_trans_cancel; - error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, - prid, resblks > 0, &ip, NULL); + error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, prid, &ip, NULL); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index cc13c37..b2136af 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -428,7 +428,7 @@ xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, - xfs_nlink_t, dev_t, prid_t, int, + xfs_nlink_t, dev_t, prid_t, struct xfs_inode **, int *); /* from xfs_file.c */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 010a13a..ec952df 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -793,8 +793,8 @@ xfs_qm_qino_alloc( return error; if (need_alloc) { - error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, - &committed); + error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ip, + &committed); if (error) { xfs_trans_cancel(tp); return error; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 68d3ca2..2e9e793 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -232,11 +232,6 @@ xfs_symlink( resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, resblks, 0, 0, &tp); - if (error == -ENOSPC && fs_blocks == 0) { - resblks = 0; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_symlink, 0, 0, 0, - &tp); - } if (error) goto out_release_inode; @@ -260,14 +255,6 @@ xfs_symlink( goto out_trans_cancel; /* - * Check for ability to enter directory entry, if no space reserved. - */ - if (!resblks) { - error = xfs_dir_canenter(tp, dp, link_name); - if (error) - goto out_trans_cancel; - } - /* * Initialize the bmap freelist prior to calling either * bmapi or the directory create code. */ @@ -277,7 +264,7 @@ xfs_symlink( * Allocate an inode for the symlink. */ error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, - prid, resblks > 0, &ip, NULL); + prid, &ip, NULL); if (error) goto out_trans_cancel; -- cgit v1.1 From b7e0b6ff54dd92febbb1914ab93cd6a21622e169 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2017 16:13:35 -0800 Subject: xfs: make iomap_begin functions trim iomaps consistently Historically, the XFS iomap_begin function only returned mappings for exactly the range queried, i.e. it doesn't do XFS_BMAPI_ENTIRE lookups. The current vfs iomap consumers are only set up to deal with trimmed mappings. xfs_xattr_iomap_begin does BMAPI_ENTIRE lookups, which is inconsistent with the current iomap usage. Remove the flag so that both iomap_begin functions behave the same way. FWIW this also fixes a behavioral regression in xattr FIEMAP that was introduced in 4.8 wherein attr fork extents are no longer trimmed like they used to be. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_iomap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 33eb4fb..7ab52a8 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1213,7 +1213,7 @@ xfs_xattr_iomap_begin( ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, - &nimaps, XFS_BMAPI_ENTIRE | XFS_BMAPI_ATTRFORK); + &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: xfs_iunlock(ip, lockmode); -- cgit v1.1 From 98087c05b9fc4ff8935bfc8f4b71afb4251c8867 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 23 Nov 2017 20:34:40 +0100 Subject: hpfs: don't bother with the i_version counter or f_version HPFS does not set SB_I_VERSION and does not use the i_version counter internally. Signed-off-by: Jeff Layton Signed-off-by: Mikulas Patocka Reviewed-by: Mikulas Patocka Signed-off-by: Linus Torvalds --- fs/hpfs/dir.c | 1 - fs/hpfs/dnode.c | 2 -- fs/hpfs/super.c | 1 - 3 files changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 8d6b7e3..c83ece7 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -150,7 +150,6 @@ static int hpfs_readdir(struct file *file, struct dir_context *ctx) if (unlikely(ret < 0)) goto out; ctx->pos = ((loff_t) hpfs_de_as_down_as_possible(inode->i_sb, hpfs_inode->i_dno) << 4) + 1; - file->f_version = inode->i_version; } next_pos = ctx->pos; if (!(de = map_pos_dirent(inode, &next_pos, &qbh))) { diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c index 3b83456..a4ad18a 100644 --- a/fs/hpfs/dnode.c +++ b/fs/hpfs/dnode.c @@ -419,7 +419,6 @@ int hpfs_add_dirent(struct inode *i, c = 1; goto ret; } - i->i_version++; c = hpfs_add_to_dnode(i, dno, name, namelen, new_de, 0); ret: return c; @@ -726,7 +725,6 @@ int hpfs_remove_dirent(struct inode *i, dnode_secno dno, struct hpfs_dirent *de, return 2; } } - i->i_version++; for_all_poss(i, hpfs_pos_del, (t = get_pos(dnode, de)) + 1, 1); hpfs_delete_de(i->i_sb, dnode, de); hpfs_mark_4buffers_dirty(qbh); diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index c45a3b9..f2c3ebc 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -235,7 +235,6 @@ static struct inode *hpfs_alloc_inode(struct super_block *sb) ei = kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS); if (!ei) return NULL; - ei->vfs_inode.i_version = 1; return &ei->vfs_inode; } -- cgit v1.1 From 438c84c2f0c794f75ab55ce65c505b01bfce4480 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 11 Dec 2017 11:28:10 +0100 Subject: ovl: don't follow redirects if redirect_dir=off Overlayfs is following redirects even when redirects are disabled. If this is unintentional (probably the majority of cases) then this can be a problem. E.g. upper layer comes from untrusted USB drive, and attacker crafts a redirect to enable read access to otherwise unreadable directories. If "redirect_dir=off", then turn off following as well as creation of redirects. If "redirect_dir=follow", then turn on following, but turn off creation of redirects (which is what "redirect_dir=off" does now). This is a backward incompatible change, so make it dependent on a config option. Reported-by: David Howells Signed-off-by: Miklos Szeredi --- fs/overlayfs/Kconfig | 10 +++++++ fs/overlayfs/namei.c | 16 ++++++++++++ fs/overlayfs/ovl_entry.h | 2 ++ fs/overlayfs/super.c | 68 ++++++++++++++++++++++++++++++++++++------------ 4 files changed, 79 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index cbfc196..5ac4154 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -24,6 +24,16 @@ config OVERLAY_FS_REDIRECT_DIR an overlay which has redirects on a kernel that doesn't support this feature will have unexpected results. +config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW + bool "Overlayfs: follow redirects even if redirects are turned off" + default y + depends on OVERLAY_FS + help + Disable this to get a possibly more secure configuration, but that + might not be backward compatible with previous kernels. + + For more information, see Documentation/filesystems/overlayfs.txt + config OVERLAY_FS_INDEX bool "Overlayfs: turn on inodes index feature by default" depends on OVERLAY_FS diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 625ed80..2a12dc2 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -681,6 +681,22 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (d.stop) break; + /* + * Following redirects can have security consequences: it's like + * a symlink into the lower layer without the permission checks. + * This is only a problem if the upper layer is untrusted (e.g + * comes from an USB drive). This can allow a non-readable file + * or directory to become readable. + * + * Only following redirects when redirects are enabled disables + * this attack vector when not necessary. + */ + err = -EPERM; + if (d.redirect && !ofs->config.redirect_follow) { + pr_warn_ratelimited("overlay: refusing to follow redirect for (%pd2)\n", dentry); + goto out_put; + } + if (d.redirect && d.redirect[0] == '/' && poe != roe) { poe = roe; diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 752bab6..9d0bc03 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -14,6 +14,8 @@ struct ovl_config { char *workdir; bool default_permissions; bool redirect_dir; + bool redirect_follow; + const char *redirect_mode; bool index; }; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 288d20f..13a8a86 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -33,6 +33,13 @@ module_param_named(redirect_dir, ovl_redirect_dir_def, bool, 0644); MODULE_PARM_DESC(ovl_redirect_dir_def, "Default to on or off for the redirect_dir feature"); +static bool ovl_redirect_always_follow = + IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW); +module_param_named(redirect_always_follow, ovl_redirect_always_follow, + bool, 0644); +MODULE_PARM_DESC(ovl_redirect_always_follow, + "Follow redirects even if redirect_dir feature is turned off"); + static bool ovl_index_def = IS_ENABLED(CONFIG_OVERLAY_FS_INDEX); module_param_named(index, ovl_index_def, bool, 0644); MODULE_PARM_DESC(ovl_index_def, @@ -232,6 +239,7 @@ static void ovl_free_fs(struct ovl_fs *ofs) kfree(ofs->config.lowerdir); kfree(ofs->config.upperdir); kfree(ofs->config.workdir); + kfree(ofs->config.redirect_mode); if (ofs->creator_cred) put_cred(ofs->creator_cred); kfree(ofs); @@ -295,6 +303,11 @@ static bool ovl_force_readonly(struct ovl_fs *ofs) return (!ofs->upper_mnt || !ofs->workdir); } +static const char *ovl_redirect_mode_def(void) +{ + return ovl_redirect_dir_def ? "on" : "off"; +} + /** * ovl_show_options * @@ -313,12 +326,10 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) } if (ofs->config.default_permissions) seq_puts(m, ",default_permissions"); - if (ofs->config.redirect_dir != ovl_redirect_dir_def) - seq_printf(m, ",redirect_dir=%s", - ofs->config.redirect_dir ? "on" : "off"); + if (strcmp(ofs->config.redirect_mode, ovl_redirect_mode_def()) != 0) + seq_printf(m, ",redirect_dir=%s", ofs->config.redirect_mode); if (ofs->config.index != ovl_index_def) - seq_printf(m, ",index=%s", - ofs->config.index ? "on" : "off"); + seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off"); return 0; } @@ -348,8 +359,7 @@ enum { OPT_UPPERDIR, OPT_WORKDIR, OPT_DEFAULT_PERMISSIONS, - OPT_REDIRECT_DIR_ON, - OPT_REDIRECT_DIR_OFF, + OPT_REDIRECT_DIR, OPT_INDEX_ON, OPT_INDEX_OFF, OPT_ERR, @@ -360,8 +370,7 @@ static const match_table_t ovl_tokens = { {OPT_UPPERDIR, "upperdir=%s"}, {OPT_WORKDIR, "workdir=%s"}, {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, - {OPT_REDIRECT_DIR_ON, "redirect_dir=on"}, - {OPT_REDIRECT_DIR_OFF, "redirect_dir=off"}, + {OPT_REDIRECT_DIR, "redirect_dir=%s"}, {OPT_INDEX_ON, "index=on"}, {OPT_INDEX_OFF, "index=off"}, {OPT_ERR, NULL} @@ -390,10 +399,37 @@ static char *ovl_next_opt(char **s) return sbegin; } +static int ovl_parse_redirect_mode(struct ovl_config *config, const char *mode) +{ + if (strcmp(mode, "on") == 0) { + config->redirect_dir = true; + /* + * Does not make sense to have redirect creation without + * redirect following. + */ + config->redirect_follow = true; + } else if (strcmp(mode, "follow") == 0) { + config->redirect_follow = true; + } else if (strcmp(mode, "off") == 0) { + if (ovl_redirect_always_follow) + config->redirect_follow = true; + } else if (strcmp(mode, "nofollow") != 0) { + pr_err("overlayfs: bad mount option \"redirect_dir=%s\"\n", + mode); + return -EINVAL; + } + + return 0; +} + static int ovl_parse_opt(char *opt, struct ovl_config *config) { char *p; + config->redirect_mode = kstrdup(ovl_redirect_mode_def(), GFP_KERNEL); + if (!config->redirect_mode) + return -ENOMEM; + while ((p = ovl_next_opt(&opt)) != NULL) { int token; substring_t args[MAX_OPT_ARGS]; @@ -428,12 +464,11 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->default_permissions = true; break; - case OPT_REDIRECT_DIR_ON: - config->redirect_dir = true; - break; - - case OPT_REDIRECT_DIR_OFF: - config->redirect_dir = false; + case OPT_REDIRECT_DIR: + kfree(config->redirect_mode); + config->redirect_mode = match_strdup(&args[0]); + if (!config->redirect_mode) + return -ENOMEM; break; case OPT_INDEX_ON: @@ -458,7 +493,7 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->workdir = NULL; } - return 0; + return ovl_parse_redirect_mode(config, config->redirect_mode); } #define OVL_WORKDIR_NAME "work" @@ -1160,7 +1195,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!cred) goto out_err; - ofs->config.redirect_dir = ovl_redirect_dir_def; ofs->config.index = ovl_index_def; err = ovl_parse_opt((char *) data, &ofs->config); if (err) -- cgit v1.1 From 08d8f8a5b094b66b29936e8751b4a818b8db1207 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 27 Nov 2017 10:12:44 -0500 Subject: ovl: Pass ovl_get_nlink() parameters in right order Right now we seem to be passing index as "lowerdentry" and origin.dentry as "upperdentry". IIUC, we should pass these parameters in reversed order and this looks like a bug. Signed-off-by: Vivek Goyal Acked-by: Amir Goldstein Fixes: caf70cb2ba5d ("ovl: cleanup orphan index entries") Cc: #v4.13 Signed-off-by: Miklos Szeredi --- fs/overlayfs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 2a12dc2..beb945e 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -435,7 +435,7 @@ int ovl_verify_index(struct dentry *index, struct ovl_path *lower, /* Check if index is orphan and don't warn before cleaning it */ if (d_inode(index)->i_nlink == 1 && - ovl_get_nlink(index, origin.dentry, 0) == 0) + ovl_get_nlink(origin.dentry, index, 0) == 0) err = -ENOENT; dput(origin.dentry); -- cgit v1.1 From b02a16e6413a2f782e542ef60bad9ff6bf212f8a Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 29 Nov 2017 07:35:21 +0200 Subject: ovl: update ctx->pos on impure dir iteration This fixes a regression with readdir of impure dir in overlayfs that is shared to VM via 9p fs. Reported-by: Miguel Bernal Marin Fixes: 4edb83bb1041 ("ovl: constant d_ino for non-merge dirs") Cc: #4.14 Signed-off-by: Amir Goldstein Tested-by: Miguel Bernal Marin Signed-off-by: Miklos Szeredi --- fs/overlayfs/readdir.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 0daa435..5108884 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -663,7 +663,10 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx) return PTR_ERR(rdt.cache); } - return iterate_dir(od->realfile, &rdt.ctx); + err = iterate_dir(od->realfile, &rdt.ctx); + ctx->pos = rdt.ctx.pos; + + return err; } -- cgit v1.1 From e8d4bfe3a71537284a90561f77c85dea6c154369 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Wed, 29 Nov 2017 10:01:32 +0800 Subject: ovl: Sync upper dirty data when syncing overlayfs When executing filesystem sync or umount on overlayfs, dirty data does not get synced as expected on upper filesystem. This patch fixes sync filesystem method to keep data consistency for overlayfs. Signed-off-by: Chengguang Xu Fixes: e593b2bf513d ("ovl: properly implement sync_filesystem()") Cc: #4.11 Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 13a8a86..76440fe 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -252,6 +252,7 @@ static void ovl_put_super(struct super_block *sb) ovl_free_fs(ofs); } +/* Sync real dirty inodes in upper filesystem (if it exists) */ static int ovl_sync_fs(struct super_block *sb, int wait) { struct ovl_fs *ofs = sb->s_fs_info; @@ -260,14 +261,24 @@ static int ovl_sync_fs(struct super_block *sb, int wait) if (!ofs->upper_mnt) return 0; - upper_sb = ofs->upper_mnt->mnt_sb; - if (!upper_sb->s_op->sync_fs) + + /* + * If this is a sync(2) call or an emergency sync, all the super blocks + * will be iterated, including upper_sb, so no need to do anything. + * + * If this is a syncfs(2) call, then we do need to call + * sync_filesystem() on upper_sb, but enough if we do it when being + * called with wait == 1. + */ + if (!wait) return 0; - /* real inodes have already been synced by sync_filesystem(ovl_sb) */ + upper_sb = ofs->upper_mnt->mnt_sb; + down_read(&upper_sb->s_umount); - ret = upper_sb->s_op->sync_fs(upper_sb, wait); + ret = sync_filesystem(upper_sb); up_read(&upper_sb->s_umount); + return ret; } -- cgit v1.1 From 7879cb43f9a75710af439c6bd81c94de1aa3d740 Mon Sep 17 00:00:00 2001 From: Vasyl Gomonovych Date: Tue, 28 Nov 2017 00:09:23 +0100 Subject: ovl: Use PTR_ERR_OR_ZERO() Fix ptr_ret.cocci warnings: fs/overlayfs/overlayfs.h:179:11-17: WARNING: PTR_ERR_OR_ZERO can be used Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR Generated by: scripts/coccinelle/api/ptr_ret.cocci Signed-off-by: Vasyl Gomonovych Signed-off-by: Miklos Szeredi --- fs/overlayfs/overlayfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 13eab09..b489099 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -180,7 +180,7 @@ static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry) static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode) { struct dentry *ret = vfs_tmpfile(dentry, mode, 0); - int err = IS_ERR(ret) ? PTR_ERR(ret) : 0; + int err = PTR_ERR_OR_ZERO(ret); pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err); return ret; -- cgit v1.1 From da2e6b7eeda8919f677c790ef51161dd02e513a6 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2017 20:27:34 +0200 Subject: ovl: fix overlay: warning prefix Conform two stray warning messages to the standard overlayfs: prefix. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/dir.c | 3 ++- fs/overlayfs/readdir.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index e139218..f9788bc 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -887,7 +887,8 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir) spin_unlock(&dentry->d_lock); } else { kfree(redirect); - pr_warn_ratelimited("overlay: failed to set redirect (%i)\n", err); + pr_warn_ratelimited("overlayfs: failed to set redirect (%i)\n", + err); /* Fall back to userspace copy-up */ err = -EXDEV; } diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 5108884..8c98578 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -499,7 +499,7 @@ out: return err; fail: - pr_warn_ratelimited("overlay: failed to look up (%s) for ino (%i)\n", + pr_warn_ratelimited("overlayfs: failed to look up (%s) for ino (%i)\n", p->name, err); goto out; } -- cgit v1.1 From 302ec300ef8a545a7fc7f667e5fd743b091c2eeb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 14 Dec 2017 15:32:38 -0800 Subject: autofs: fix careless error in recent commit Commit ecc0c469f277 ("autofs: don't fail mount for transient error") was meant to replace an 'if' with a 'switch', but instead added the 'switch' leaving the case in place. Link: http://lkml.kernel.org/r/87zi6wstmw.fsf@notabene.neil.brown.name Fixes: ecc0c469f277 ("autofs: don't fail mount for transient error") Reported-by: Ben Hutchings Signed-off-by: NeilBrown Cc: Ian Kent Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/waitq.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 8fc4170..961a12d 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -170,7 +170,6 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, mutex_unlock(&sbi->wq_mutex); - if (autofs4_write(sbi, pipe, &pkt, pktsz)) switch (ret = autofs4_write(sbi, pipe, &pkt, pktsz)) { case 0: break; -- cgit v1.1 From 3756f6401c302617c5e091081ca4d26ab604bec5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 14 Dec 2017 15:32:41 -0800 Subject: exec: avoid gcc-8 warning for get_task_comm gcc-8 warns about using strncpy() with the source size as the limit: fs/exec.c:1223:32: error: argument to 'sizeof' in 'strncpy' call is the same expression as the source; did you mean to use the size of the destination? [-Werror=sizeof-pointer-memaccess] This is indeed slightly suspicious, as it protects us from source arguments without NUL-termination, but does not guarantee that the destination is terminated. This keeps the strncpy() to ensure we have properly padded target buffer, but ensures that we use the correct length, by passing the actual length of the destination buffer as well as adding a build-time check to ensure it is exactly TASK_COMM_LEN. There are only 23 callsites which I all reviewed to ensure this is currently the case. We could get away with doing only the check or passing the right length, but it doesn't hurt to do both. Link: http://lkml.kernel.org/r/20171205151724.1764896-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Suggested-by: Kees Cook Acked-by: Kees Cook Acked-by: Ingo Molnar Cc: Alexander Viro Cc: Peter Zijlstra Cc: Serge Hallyn Cc: James Morris Cc: Aleksa Sarai Cc: "Eric W. Biederman" Cc: Frederic Weisbecker Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 6be2aa0..156f56a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1216,15 +1216,14 @@ killed: return -EAGAIN; } -char *get_task_comm(char *buf, struct task_struct *tsk) +char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) { - /* buf must be at least sizeof(tsk->comm) in size */ task_lock(tsk); - strncpy(buf, tsk->comm, sizeof(tsk->comm)); + strncpy(buf, tsk->comm, buf_size); task_unlock(tsk); return buf; } -EXPORT_SYMBOL_GPL(get_task_comm); +EXPORT_SYMBOL_GPL(__get_task_comm); /* * These functions flushes out all traces of the currently running executable -- cgit v1.1 From bdcf0a423ea1c40bbb40e7ee483b50fc8aa3d758 Mon Sep 17 00:00:00 2001 From: Thiago Rafael Becker Date: Thu, 14 Dec 2017 15:33:12 -0800 Subject: kernel: make groups_sort calling a responsibility group_info allocators In testing, we found that nfsd threads may call set_groups in parallel for the same entry cached in auth.unix.gid, racing in the call of groups_sort, corrupting the groups for that entry and leading to permission denials for the client. This patch: - Make groups_sort globally visible. - Move the call to groups_sort to the modifiers of group_info - Remove the call to groups_sort from set_groups Link: http://lkml.kernel.org/r/20171211151420.18655-1-thiago.becker@gmail.com Signed-off-by: Thiago Rafael Becker Reviewed-by: Matthew Wilcox Reviewed-by: NeilBrown Acked-by: "J. Bruce Fields" Cc: Al Viro Cc: Martin Schwidefsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/auth.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 697f8ae..f650e47 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -60,6 +60,9 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) gi->gid[i] = exp->ex_anon_gid; else gi->gid[i] = rqgi->gid[i]; + + /* Each thread allocates its own gi, no race */ + groups_sort(gi); } } else { gi = get_group_info(rqgi); -- cgit v1.1 From c156618e15101a9cc8c815108fec0300a0ec6637 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Tue, 5 Dec 2017 13:55:44 -0500 Subject: nfs: fix a deadlock in nfs client initialization The following deadlock can occur between a process waiting for a client to initialize in while walking the client list during nfsv4 server trunking detection and another process waiting for the nfs_clid_init_mutex so it can initialize that client: Process 1 Process 2 --------- --------- spin_lock(&nn->nfs_client_lock); list_add_tail(&CLIENTA->cl_share_link, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); spin_lock(&nn->nfs_client_lock); list_add_tail(&CLIENTB->cl_share_link, &nn->nfs_client_list); spin_unlock(&nn->nfs_client_lock); mutex_lock(&nfs_clid_init_mutex); nfs41_walk_client_list(clp, result, cred); nfs_wait_client_init_complete(CLIENTA); (waiting for nfs_clid_init_mutex) Make sure nfs_match_client() only evaluates clients that have completed initialization in order to prevent that deadlock. This patch also fixes v4.0 trunking behavior by not marking the client NFS_CS_READY until the clientid has been confirmed. Signed-off-by: Scott Mayhew Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 11 +++++++++++ fs/nfs/nfs4client.c | 17 +++++++++++++---- 2 files changed, 24 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 0ac2fb1..b9129e2 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -291,12 +291,23 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat const struct sockaddr *sap = data->addr; struct nfs_net *nn = net_generic(data->net, nfs_net_id); +again: list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; /* Don't match clients that failed to initialise properly */ if (clp->cl_cons_state < 0) continue; + /* If a client is still initializing then we need to wait */ + if (clp->cl_cons_state > NFS_CS_READY) { + refcount_inc(&clp->cl_count); + spin_unlock(&nn->nfs_client_lock); + nfs_wait_client_init_complete(clp); + nfs_put_client(clp); + spin_lock(&nn->nfs_client_lock); + goto again; + } + /* Different NFS versions cannot share the same nfs_client */ if (clp->rpc_ops != data->nfs_mod->rpc_ops) continue; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 12bbab0..65a7e5d 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -404,15 +404,19 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, if (error < 0) goto error; - if (!nfs4_has_session(clp)) - nfs_mark_client_ready(clp, NFS_CS_READY); - error = nfs4_discover_server_trunking(clp, &old); if (error < 0) goto error; - if (clp != old) + if (clp != old) { clp->cl_preserve_clid = true; + /* + * Mark the client as having failed initialization so other + * processes walking the nfs_client_list in nfs_match_client() + * won't try to use it. + */ + nfs_mark_client_ready(clp, -EPERM); + } nfs_put_client(clp); clear_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags); return old; @@ -539,6 +543,9 @@ int nfs40_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { + if (pos == new) + goto found; + status = nfs4_match_client(pos, new, &prev, nn); if (status < 0) goto out_unlock; @@ -559,6 +566,7 @@ int nfs40_walk_client_list(struct nfs_client *new, * way that a SETCLIENTID_CONFIRM to pos can succeed is * if new and pos point to the same server: */ +found: refcount_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); @@ -572,6 +580,7 @@ int nfs40_walk_client_list(struct nfs_client *new, case 0: nfs4_swap_callback_idents(pos, new); pos->cl_confirm = new->cl_confirm; + nfs_mark_client_ready(pos, NFS_CS_READY); prev = NULL; *result = pos; -- cgit v1.1 From dc4fd9ab01ab379ae5af522b3efd4187a7c30a31 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Fri, 8 Dec 2017 16:00:12 -0500 Subject: nfs: don't wait on commit in nfs_commit_inode() if there were no commit requests If there were no commit requests, then nfs_commit_inode() should not wait on the commit or mark the inode dirty, otherwise the following BUG_ON can be triggered: [ 1917.130762] kernel BUG at fs/inode.c:578! [ 1917.130766] Oops: Exception in kernel mode, sig: 5 [#1] [ 1917.130768] SMP NR_CPUS=2048 NUMA pSeries [ 1917.130772] Modules linked in: iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi blocklayoutdriver rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache sunrpc sg nx_crypto pseries_rng ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic crct10dif_common ibmvscsi scsi_transport_srp ibmveth scsi_tgt dm_mirror dm_region_hash dm_log dm_mod [ 1917.130805] CPU: 2 PID: 14923 Comm: umount.nfs4 Tainted: G ------------ T 3.10.0-768.el7.ppc64 #1 [ 1917.130810] task: c0000005ecd88040 ti: c00000004cea0000 task.ti: c00000004cea0000 [ 1917.130813] NIP: c000000000354178 LR: c000000000354160 CTR: c00000000012db80 [ 1917.130816] REGS: c00000004cea3720 TRAP: 0700 Tainted: G ------------ T (3.10.0-768.el7.ppc64) [ 1917.130820] MSR: 8000000100029032 CR: 22002822 XER: 20000000 [ 1917.130828] CFAR: c00000000011f594 SOFTE: 1 GPR00: c000000000354160 c00000004cea39a0 c0000000014c4700 c0000000018cc750 GPR04: 000000000000c750 80c0000000000000 0600000000000000 04eeb76bea749a03 GPR08: 0000000000000034 c0000000018cc758 0000000000000001 d000000005e619e8 GPR12: c00000000012db80 c000000007b31200 0000000000000000 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR24: 0000000000000000 c000000000dfc3ec 0000000000000000 c0000005eefc02c0 GPR28: d0000000079dbd50 c0000005b94a02c0 c0000005b94a0250 c0000005b94a01c8 [ 1917.130867] NIP [c000000000354178] .evict+0x1c8/0x350 [ 1917.130871] LR [c000000000354160] .evict+0x1b0/0x350 [ 1917.130873] Call Trace: [ 1917.130876] [c00000004cea39a0] [c000000000354160] .evict+0x1b0/0x350 (unreliable) [ 1917.130880] [c00000004cea3a30] [c0000000003558cc] .evict_inodes+0x13c/0x270 [ 1917.130884] [c00000004cea3af0] [c000000000327d20] .kill_anon_super+0x70/0x1e0 [ 1917.130896] [c00000004cea3b80] [d000000005e43e30] .nfs_kill_super+0x20/0x60 [nfs] [ 1917.130900] [c00000004cea3c00] [c000000000328a20] .deactivate_locked_super+0xa0/0x1b0 [ 1917.130903] [c00000004cea3c80] [c00000000035ba54] .cleanup_mnt+0xd4/0x180 [ 1917.130907] [c00000004cea3d10] [c000000000119034] .task_work_run+0x114/0x150 [ 1917.130912] [c00000004cea3db0] [c00000000001ba6c] .do_notify_resume+0xcc/0x100 [ 1917.130916] [c00000004cea3e30] [c00000000000a7b0] .ret_from_except_lite+0x5c/0x60 [ 1917.130919] Instruction dump: [ 1917.130921] 7fc3f378 486734b5 60000000 387f00a0 38800003 4bdcb365 60000000 e95f00a0 [ 1917.130927] 694a0060 7d4a0074 794ad182 694a0001 <0b0a0000> 892d02a4 2f890000 40de0134 Signed-off-by: Scott Mayhew Cc: stable@vger.kernel.org # 4.5+ Signed-off-by: Anna Schumaker --- fs/nfs/write.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 5b5f464..4a379d7 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1890,6 +1890,8 @@ int nfs_commit_inode(struct inode *inode, int how) if (res) error = nfs_generic_commit_list(inode, &head, how, &cinfo); nfs_commit_end(cinfo.mds); + if (res == 0) + return res; if (error < 0) goto out_error; if (!may_wait) -- cgit v1.1 From f6f3732162b5ae3c771b9285a5a32d72b8586920 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 15 Dec 2017 18:53:22 -0800 Subject: Revert "mm: replace p??_write with pte_access_permitted in fault + gup paths" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commits 5c9d2d5c269c, c7da82b894e9, and e7fe7b5cae90. We'll probably need to revisit this, but basically we should not complicate the get_user_pages_fast() case, and checking the actual page table protection key bits will require more care anyway, since the protection keys depend on the exact state of the VM in question. Particularly when doing a "remote" page lookup (ie in somebody elses VM, not your own), you need to be much more careful than this was. Dave Hansen says: "So, the underlying bug here is that we now a get_user_pages_remote() and then go ahead and do the p*_access_permitted() checks against the current PKRU. This was introduced recently with the addition of the new p??_access_permitted() calls. We have checks in the VMA path for the "remote" gups and we avoid consulting PKRU for them. This got missed in the pkeys selftests because I did a ptrace read, but not a *write*. I also didn't explicitly test it against something where a COW needed to be done" It's also not entirely clear that it makes sense to check the protection key bits at this level at all. But one possible eventual solution is to make the get_user_pages_fast() case just abort if it sees protection key bits set, which makes us fall back to the regular get_user_pages() case, which then has a vma and can do the check there if we want to. We'll see. Somewhat related to this all: what we _do_ want to do some day is to check the PAGE_USER bit - it should obviously always be set for user pages, but it would be a good check to have back. Because we have no generic way to test for it, we lost it as part of moving over from the architecture-specific x86 GUP implementation to the generic one in commit e585513b76f7 ("x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation"). Cc: Peter Zijlstra Cc: Dan Williams Cc: Dave Hansen Cc: Kirill A. Shutemov Cc: "Jérôme Glisse" Cc: Andrew Morton Cc: Al Viro Signed-off-by: Linus Torvalds --- fs/dax.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 78b72c4..9598159 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -627,8 +627,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, if (pfn != pmd_pfn(*pmdp)) goto unlock_pmd; - if (!pmd_dirty(*pmdp) - && !pmd_access_permitted(*pmdp, WRITE)) + if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) goto unlock_pmd; flush_cache_page(vma, address, pfn); -- cgit v1.1