From f5d0684e848f01347ba510545822c205889f8acc Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Thu, 2 Apr 2015 14:51:35 +0100 Subject: cifs: Don't replace dentries for dfs mounts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Doing a readdir on a dfs root can result in the dentries for directories with a dfs share mounted being replaced by new dentries for objects returned by the readdir call. These new dentries on shares mounted with unix extenstions show up as symlinks pointing to the dfs share. # mount -t cifs -o sec=none //vm140-31/dfsroot cifs # stat cifs/testlink/testfile; ls -l cifs File: ‘cifs/testlink/testfile’ Size: 0 Blocks: 0 IO Block: 16384 regular empty file Device: 27h/39d Inode: 130120 Links: 1 Access: (0644/-rw-r--r--) Uid: ( 0/ root) Gid: ( 0/ root) Access: 2015-03-31 13:55:50.106018200 +0100 Modify: 2015-03-31 13:55:50.106018200 +0100 Change: 2015-03-31 13:55:50.106018200 +0100 Birth: - total 0 drwxr-xr-x 2 root root 0 Mar 31 13:54 testdir lrwxrwxrwx 1 root root 19 Mar 24 14:25 testlink -> \vm140-31\test In the example above, the stat command mounts the dfs share at cifs/testlink. The subsequent ls on the dfsroot directory replaces the dentry for testlink with a symlink. In the earlier code, the d_invalidate command returned an -EBUSY error when attempting to invalidate directories. This stopped the code from replacing the directories with symlinks returned by the readdir call. Changes were recently made to the d_invalidate() command so that it no longer returns an error code. This results in the directory with the mounted dfs share being replaced by a symlink which denotes a dfs share. Signed-off-by: Sachin Prabhu Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/readdir.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index b4a4723..b1eede3 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -90,6 +90,8 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, if (dentry) { inode = d_inode(dentry); if (inode) { + if (d_mountpoint(dentry)) + goto out; /* * If we're generating inode numbers, then we don't * want to clobber the existing one with the one that -- cgit v1.1 From bc8ebdc4f54cc944b0ecc0fb0d18b0ffbaab0468 Mon Sep 17 00:00:00 2001 From: Nakajima Akira Date: Fri, 13 Feb 2015 15:35:58 +0900 Subject: Fix that several functions handle incorrect value of mapchars Cifs client has problem with reserved chars filename. [BUG1] : several functions handle incorrect value of mapchars - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); [BUG2] : forget to convert reserved chars when creating SymbolicLink. - CIFSUnixCreateSymLink() calls cifs_strtoUTF16 + CIFSUnixCreateSymLink() calls cifsConvertToUTF16() with remap [BUG3] : forget to convert reserved chars when getting SymbolicLink. - CIFSSMBUnixQuerySymLink() calls cifs_strtoUTF16 + CIFSSMBUnixQuerySymLink() calls cifsConvertToUTF16() with remap [BUG4] : /proc/mounts don't show "mapposix" when using mapposix mount option + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR) + seq_puts(s, ",mapposix"); Reported-by: t.wede@kw-reneg.de Reported-by: Nakajima Akira Signed-off-by: Nakajima Akira Signed-off-by: Carl Schaefer Signed-off-by: Steve French --- fs/cifs/cifs_dfs_ref.c | 3 ++- fs/cifs/cifsfs.c | 2 ++ fs/cifs/cifsproto.h | 4 ++-- fs/cifs/cifssmb.c | 21 +++++++++++---------- fs/cifs/dir.c | 3 +-- fs/cifs/file.c | 3 +-- fs/cifs/inode.c | 6 ++---- fs/cifs/link.c | 3 ++- fs/cifs/smb1ops.c | 3 ++- 9 files changed, 25 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 430e034..7dc886c 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -24,6 +24,7 @@ #include "cifsfs.h" #include "dns_resolve.h" #include "cifs_debug.h" +#include "cifs_unicode.h" static LIST_HEAD(cifs_dfs_automount_list); @@ -312,7 +313,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) xid = get_xid(); rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls, &num_referrals, &referrals, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); free_xid(xid); cifs_put_tlink(tlink); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index f5089bd..0a9fb6b 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -469,6 +469,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",nouser_xattr"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) seq_puts(s, ",mapchars"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR) + seq_puts(s, ",mapposix"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) seq_puts(s, ",sfu"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index c31ce98..c63fd1d 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -361,11 +361,11 @@ extern int CIFSUnixCreateHardLink(const unsigned int xid, extern int CIFSUnixCreateSymLink(const unsigned int xid, struct cifs_tcon *tcon, const char *fromName, const char *toName, - const struct nls_table *nls_codepage); + const struct nls_table *nls_codepage, int remap); extern int CIFSSMBUnixQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, char **syminfo, - const struct nls_table *nls_codepage); + const struct nls_table *nls_codepage, int remap); extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, char **symlinkinfo, const struct nls_table *nls_codepage); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 84650a5..1091aff 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2784,7 +2784,7 @@ copyRetry: int CIFSUnixCreateSymLink(const unsigned int xid, struct cifs_tcon *tcon, const char *fromName, const char *toName, - const struct nls_table *nls_codepage) + const struct nls_table *nls_codepage, int remap) { TRANSACTION2_SPI_REQ *pSMB = NULL; TRANSACTION2_SPI_RSP *pSMBr = NULL; @@ -2804,9 +2804,9 @@ createSymLinkRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifs_strtoUTF16((__le16 *) pSMB->FileName, fromName, - /* find define for this maxpathcomponent */ - PATH_MAX, nls_codepage); + cifsConvertToUTF16((__le16 *) pSMB->FileName, fromName, + /* find define for this maxpathcomponent */ + PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; @@ -2828,9 +2828,9 @@ createSymLinkRetry: data_offset = (char *) (&pSMB->hdr.Protocol) + offset; if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len_target = - cifs_strtoUTF16((__le16 *) data_offset, toName, PATH_MAX - /* find define for this maxpathcomponent */ - , nls_codepage); + cifsConvertToUTF16((__le16 *) data_offset, toName, + /* find define for this maxpathcomponent */ + PATH_MAX, nls_codepage, remap); name_len_target++; /* trailing null */ name_len_target *= 2; } else { /* BB improve the check for buffer overruns BB */ @@ -3034,7 +3034,7 @@ winCreateHardLinkRetry: int CIFSSMBUnixQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, char **symlinkinfo, - const struct nls_table *nls_codepage) + const struct nls_table *nls_codepage, int remap) { /* SMB_QUERY_FILE_UNIX_LINK */ TRANSACTION2_QPI_REQ *pSMB = NULL; @@ -3055,8 +3055,9 @@ querySymLinkRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifs_strtoUTF16((__le16 *) pSMB->FileName, searchName, - PATH_MAX, nls_codepage); + cifsConvertToUTF16((__le16 *) pSMB->FileName, + searchName, PATH_MAX, nls_codepage, + remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve the check for buffer overruns BB */ diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 338d569..c3eb998 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -620,8 +620,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, } rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); if (rc) goto mknod_out; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index cafbf10..5cfa712 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -140,8 +140,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode, posix_flags = cifs_posix_convert_flags(f_flags); rc = CIFSPOSIXCreate(xid, tcon, posix_flags, mode, pnetfid, presp_data, poplock, full_path, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); cifs_put_tlink(tlink); if (rc) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 55b5811..ef71aa1 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -373,8 +373,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* could have done a find first instead but this returns more info */ rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_sb->local_nls, cifs_remap(cifs_sb)); cifs_put_tlink(tlink); if (!rc) { @@ -2215,8 +2214,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) pTcon = tlink_tcon(tlink); rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); cifs_put_tlink(tlink); } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 252e672..e6c707c 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -717,7 +717,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname); else if (pTcon->unix_ext) rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, - cifs_sb->local_nls); + cifs_sb->local_nls, + cifs_remap(cifs_sb)); /* else rc = CIFSCreateReparseSymLink(xid, pTcon, fromName, toName, cifs_sb_target->local_nls); */ diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 7bfdd60..fc537c2 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -960,7 +960,8 @@ cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, /* Check for unix extensions */ if (cap_unix(tcon->ses)) { rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path, - cifs_sb->local_nls); + cifs_sb->local_nls, + cifs_remap(cifs_sb)); if (rc == -EREMOTE) rc = cifs_unix_dfs_readlink(xid, tcon, full_path, target_path, -- cgit v1.1 From 6b19687563fa6f385ac314afa30b7579e8c3174c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 6 May 2015 18:26:58 -0400 Subject: nfs: stat(2) fails during cthon04 basic test5 on NFSv4.0 When running the Connectathon basic tests against a Solaris NFS server over NFSv4.0, test5 reports that stat(2) returns a file size of zero instead of 1MB. On success, nfs_commit_inode() can return a positive result; see other call sites such as nfs_file_fsync_commit() and nfs_commit_unstable_pages(). The call site recently added in nfs_wb_all() does not prevent that positive return value from leaking to its callers. If it leaks through nfs_sync_inode() back to nfs_getattr(), that causes stat(2) to return a positive return value to user space while also not filling in the passed-in struct stat. Additional clean up: the new logic in nfs_wb_all() is rewritten in bfields-normal form. Fixes: 5bb89b4702e2 ("NFSv4.1/pnfs: Separate out metadata . . .") Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d12a4be..dfc19f1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1845,12 +1845,15 @@ int nfs_wb_all(struct inode *inode) trace_nfs_writeback_inode_enter(inode); ret = filemap_write_and_wait(inode->i_mapping); - if (!ret) { - ret = nfs_commit_inode(inode, FLUSH_SYNC); - if (!ret) - pnfs_sync_inode(inode, true); - } + if (ret) + goto out; + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (ret < 0) + goto out; + pnfs_sync_inode(inode, true); + ret = 0; +out: trace_nfs_writeback_inode_exit(inode, ret); return ret; } -- cgit v1.1 From feaff8e5b2cfc3eae02cf65db7a400b0b9ffc596 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 12 May 2015 15:48:10 -0400 Subject: nfs: take extra reference to fl->fl_file when running a setlk We had a report of a crash while stress testing the NFS client: BUG: unable to handle kernel NULL pointer dereference at 0000000000000150 IP: [] locks_get_lock_context+0x8/0x90 PGD 0 Oops: 0000 [#1] SMP Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 xt_conntrack ebtable_nat ebtable_filter ebtable_broute bridge stp llc ebtables ip6table_security ip6table_mangle ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_raw ip6table_filter ip6_tables iptable_security iptable_mangle iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_raw coretemp crct10dif_pclmul ppdev crc32_pclmul crc32c_intel ghash_clmulni_intel vmw_balloon serio_raw vmw_vmci i2c_piix4 shpchp parport_pc acpi_cpufreq parport nfsd auth_rpcgss nfs_acl lockd grace sunrpc vmwgfx drm_kms_helper ttm drm mptspi scsi_transport_spi mptscsih mptbase e1000 ata_generic pata_acpi CPU: 1 PID: 399 Comm: kworker/1:1H Not tainted 4.1.0-0.rc1.git0.1.fc23.x86_64 #1 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/30/2013 Workqueue: rpciod rpc_async_schedule [sunrpc] task: ffff880036aea7c0 ti: ffff8800791f4000 task.ti: ffff8800791f4000 RIP: 0010:[] [] locks_get_lock_context+0x8/0x90 RSP: 0018:ffff8800791f7c00 EFLAGS: 00010293 RAX: ffff8800791f7c40 RBX: ffff88001f2ad8c0 RCX: ffffe8ffffc80305 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffff8800791f7c88 R08: ffff88007fc971d8 R09: 279656d600000000 R10: 0000034a01000000 R11: 279656d600000000 R12: ffff88001f2ad918 R13: ffff88001f2ad8c0 R14: 0000000000000000 R15: 0000000100e73040 FS: 0000000000000000(0000) GS:ffff88007fc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000150 CR3: 0000000001c0b000 CR4: 00000000000407e0 Stack: ffffffff8127c5b0 ffff8800791f7c18 ffffffffa0171e29 ffff8800791f7c58 ffffffffa0171ef8 ffff8800791f7c78 0000000000000246 ffff88001ea0ba00 ffff8800791f7c40 ffff8800791f7c40 00000000ff5d86a3 ffff8800791f7ca8 Call Trace: [] ? __posix_lock_file+0x40/0x760 [] ? rpc_make_runnable+0x99/0xa0 [sunrpc] [] ? rpc_wake_up_task_queue_locked.part.35+0xc8/0x250 [sunrpc] [] posix_lock_file_wait+0x4a/0x120 [] ? nfs41_wake_and_assign_slot+0x32/0x40 [nfsv4] [] ? nfs41_sequence_done+0xd8/0x2d0 [nfsv4] [] do_vfs_lock+0x2d/0x30 [nfsv4] [] nfs4_lock_done+0x1ad/0x210 [nfsv4] [] ? __rpc_sleep_on_priority+0x390/0x390 [sunrpc] [] ? __rpc_sleep_on_priority+0x390/0x390 [sunrpc] [] rpc_exit_task+0x2c/0xa0 [sunrpc] [] ? call_refreshresult+0x150/0x150 [sunrpc] [] __rpc_execute+0x90/0x460 [sunrpc] [] rpc_async_schedule+0x15/0x20 [sunrpc] [] process_one_work+0x1bb/0x410 [] worker_thread+0x53/0x480 [] ? process_one_work+0x410/0x410 [] ? process_one_work+0x410/0x410 [] kthread+0xd8/0xf0 [] ? kthread_worker_fn+0x180/0x180 [] ret_from_fork+0x42/0x70 [] ? kthread_worker_fn+0x180/0x180 Jean says: "Running locktests with a large number of iterations resulted in a client crash. The test run took a while and hasn't finished after close to 2 hours. The crash happened right after I gave up and killed the test (after 107m) with Ctrl+C." The crash happened because a NULL inode pointer got passed into locks_get_lock_context. The call chain indicates that file_inode(filp) returned NULL, which means that f_inode was NULL. Since that's zeroed out in __fput, that suggests that this filp pointer outlived the last reference. Looking at the code, that seems possible. We copy the struct file_lock that's passed in, but if the task is signalled at an inopportune time we can end up trying to use that file_lock in rpciod context after the process that requested it has already returned (and possibly put its filp reference). Fix this by taking an extra reference to the filp when we allocate the lock info, and put it in nfs4_lock_release. Reported-by: Jean Spector Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 45b35b9..55e1e3a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -5604,6 +5605,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->server = server; atomic_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); + get_file(fl->fl_file); memcpy(&p->fl, fl, sizeof(p->fl)); return p; out_free_seqid: @@ -5716,6 +5718,7 @@ static void nfs4_lock_release(void *calldata) nfs_free_seqid(data->arg.lock_seqid); nfs4_put_lock_state(data->lsp); put_nfs_open_context(data->ctx); + fput(data->fl.fl_file); kfree(data); dprintk("%s: done!\n", __func__); } -- cgit v1.1 From d377c5eb54dd05aeb3094b7740252d19ba7791f7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 14 May 2015 10:04:44 +0200 Subject: ovl: don't remove non-empty opaque directory When removing an opaque directory we can't just call rmdir() to check for emptiness, because the directory will need to be replaced with a whiteout. The replacement is done with RENAME_EXCHANGE, which doesn't check emptiness. Solution is just to check emptiness by reading the directory. In the future we could add a new rename flag to check for emptiness even for RENAME_EXCHANGE to optimize this case. Reported-by: Vincent Batts Signed-off-by: Miklos Szeredi Tested-by: Jordi Pujol Palomer Fixes: 263b4a0fee43 ("ovl: dont replace opaque dir") Cc: # v4.0+ --- fs/overlayfs/dir.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index d139405..2578a0c 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -506,11 +506,25 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) struct dentry *opaquedir = NULL; int err; - if (is_dir && OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) { - opaquedir = ovl_check_empty_and_clear(dentry); - err = PTR_ERR(opaquedir); - if (IS_ERR(opaquedir)) - goto out; + if (is_dir) { + if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) { + opaquedir = ovl_check_empty_and_clear(dentry); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out; + } else { + LIST_HEAD(list); + + /* + * When removing an empty opaque directory, then it + * makes no sense to replace it with an exact replica of + * itself. But emptiness still needs to be checked. + */ + err = ovl_check_empty_dir(dentry, &list); + ovl_cache_free(&list); + if (err) + goto out; + } } err = ovl_lock_rename_workdir(workdir, upperdir); -- cgit v1.1 From cc6f67bcafcb6bbbb2d1be1603dcd95125a52800 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 19 May 2015 14:30:12 +0200 Subject: ovl: mount read-only if workdir can't be created OpenWRT folks reported that overlayfs fails to mount if upper fs is full, because workdir can't be created. Wordir creation can fail for various other reasons too. There's no reason that the mount itself should fail, overlayfs can work fine without a workdir, as long as the overlay isn't modified. So mount it read-only and don't allow remounting read-write. Add a couple of WARN_ON()s for the impossible case of workdir being used despite being read-only. Reported-by: Bastian Bittorf Signed-off-by: Miklos Szeredi Cc: # v3.18+ --- fs/overlayfs/copy_up.c | 3 +++ fs/overlayfs/dir.c | 9 +++++++++ fs/overlayfs/super.c | 10 +++++----- 3 files changed, 17 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 24f6404..84d693d 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -299,6 +299,9 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, struct cred *override_cred; char *link = NULL; + if (WARN_ON(!workdir)) + return -EROFS; + ovl_path_upper(parent, &parentpath); upperdir = parentpath.dentry; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 2578a0c..692ceda 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -222,6 +222,9 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, struct kstat stat; int err; + if (WARN_ON(!workdir)) + return ERR_PTR(-EROFS); + err = ovl_lock_rename_workdir(workdir, upperdir); if (err) goto out; @@ -322,6 +325,9 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, struct dentry *newdentry; int err; + if (WARN_ON(!workdir)) + return -EROFS; + err = ovl_lock_rename_workdir(workdir, upperdir); if (err) goto out; @@ -506,6 +512,9 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) struct dentry *opaquedir = NULL; int err; + if (WARN_ON(!workdir)) + return -EROFS; + if (is_dir) { if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) { opaquedir = ovl_check_empty_and_clear(dentry); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 5f0d199..bf8537c 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -529,7 +529,7 @@ static int ovl_remount(struct super_block *sb, int *flags, char *data) { struct ovl_fs *ufs = sb->s_fs_info; - if (!(*flags & MS_RDONLY) && !ufs->upper_mnt) + if (!(*flags & MS_RDONLY) && (!ufs->upper_mnt || !ufs->workdir)) return -EROFS; return 0; @@ -925,9 +925,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry); err = PTR_ERR(ufs->workdir); if (IS_ERR(ufs->workdir)) { - pr_err("overlayfs: failed to create directory %s/%s\n", - ufs->config.workdir, OVL_WORKDIR_NAME); - goto out_put_upper_mnt; + pr_warn("overlayfs: failed to create directory %s/%s (errno: %i); mounting read-only\n", + ufs->config.workdir, OVL_WORKDIR_NAME, -err); + sb->s_flags |= MS_RDONLY; + ufs->workdir = NULL; } } @@ -997,7 +998,6 @@ out_put_lower_mnt: kfree(ufs->lower_mnt); out_put_workdir: dput(ufs->workdir); -out_put_upper_mnt: mntput(ufs->upper_mnt); out_put_lowerpath: for (i = 0; i < numlower; i++) -- cgit v1.1 From 2c2ed5aa0154c0be67f98c970de6b5587dbc045a Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 19 May 2015 12:49:50 -0700 Subject: btrfs: clear 'ret' in btrfs_check_shared() loop btrfs_check_shared() is leaking a return value of '1' from find_parent_nodes(). As a result, callers (in this case, extent_fiemap()) are told extents are shared when they are not. This in turn broke fiemap on btrfs for kernels v3.18 and up. The fix is simple - we just have to clear 'ret' after we are done processing the results of find_parent_nodes(). It wasn't clear to me at first what was happening with return values in btrfs_check_shared() and find_parent_nodes() - thanks to Josef for the help on irc. I added documentation to both functions to make things more clear for the next hacker who might come across them. If we could queue this up for -stable too that would be great. Signed-off-by: Mark Fasheh Reviewed-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 9de772e..614aaa1 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -880,6 +880,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, * indirect refs to their parent bytenr. * When roots are found, they're added to the roots list * + * NOTE: This can return values > 0 + * * FIXME some caching might speed things up */ static int find_parent_nodes(struct btrfs_trans_handle *trans, @@ -1198,6 +1200,19 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, return ret; } +/** + * btrfs_check_shared - tell us whether an extent is shared + * + * @trans: optional trans handle + * + * btrfs_check_shared uses the backref walking code but will short + * circuit as soon as it finds a root or inode that doesn't match the + * one passed in. This provides a significant performance benefit for + * callers (such as fiemap) which want to know whether the extent is + * shared but do not need a ref count. + * + * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. + */ int btrfs_check_shared(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 root_objectid, u64 inum, u64 bytenr) @@ -1226,11 +1241,13 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans, ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp, roots, NULL, root_objectid, inum); if (ret == BACKREF_FOUND_SHARED) { + /* this is the only condition under which we return 1 */ ret = 1; break; } if (ret < 0 && ret != -ENOENT) break; + ret = 0; node = ulist_next(tmp, &uiter); if (!node) break; -- cgit v1.1 From a96295965b600f2dc6ad661c4803c86e87db3d7b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 18 May 2015 19:11:40 +0100 Subject: Btrfs: fix racy system chunk allocation when setting block group ro If while setting a block group read-only we end up allocating a system chunk, through check_system_chunk(), we were not doing it while holding the chunk mutex which is a problem if a concurrent chunk allocation is happening, through do_chunk_alloc(), as it means both block groups can end up using the same logical addresses and physical regions in the device(s). So make sure we hold the chunk mutex. Cc: stable@vger.kernel.org # 4.0+ Fixes: 2f0810880f08 ("btrfs: delete chunk allocation attemp when setting block group ro") Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 ++ fs/btrfs/volumes.c | 1 + 2 files changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7effed6..45e3f08 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8842,7 +8842,9 @@ again: out: if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { alloc_flags = update_block_group_flags(root, cache->flags); + lock_chunks(root->fs_info->chunk_root); check_system_chunk(trans, root, alloc_flags); + unlock_chunks(root->fs_info->chunk_root); } mutex_unlock(&root->fs_info->ro_block_group_mutex); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 96aebf3..174f5e1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4625,6 +4625,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, { u64 chunk_offset; + ASSERT(mutex_is_locked(&extent_root->fs_info->chunk_mutex)); chunk_offset = find_next_chunk(extent_root->fs_info); return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type); } -- cgit v1.1 From 1dc92c450a53f120b67296cb4b29c1dfdc665ac1 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 20 May 2015 09:32:21 -0500 Subject: [cifs] fix null pointer check Dan Carpenter pointed out an inconsistent null pointer check in smb2_hdr_assemble that was pointed out by static checker. Signed-off-by: Steve French Reviewed-by: Sachin Prabhu CC: Dan Carpenter w --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 65cd7a8..54cbe19 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -110,7 +110,7 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , /* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */ /* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */ - if ((tcon->ses) && + if ((tcon->ses) && (tcon->ses->server) && (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) hdr->CreditCharge = cpu_to_le16(1); /* else CreditCharge MBZ */ -- cgit v1.1 From 65c3b205ebe06a389a29544e71a8071da0ce4155 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 30 Apr 2015 17:30:24 +0300 Subject: CIFS: remove an unneeded NULL check Smatch complains because we dereference "ses->server" without checking some lines earlier inside the call to get_next_mid(ses->server). fs/cifs/cifssmb.c:4921 CIFSGetDFSRefer() warn: variable dereferenced before check 'ses->server' (see line 4899) There is only one caller for this function get_dfs_path() and it always passes a non-null "ses->server" pointer so this NULL check can be removed. Signed-off-by: Dan Carpenter Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 1091aff..f26ffbf 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -4918,7 +4918,7 @@ getDFSRetry: strncpy(pSMB->RequestFileName, search_name, name_len); } - if (ses->server && ses->server->sign) + if (ses->server->sign) pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; pSMB->hdr.Uid = ses->Suid; -- cgit v1.1 From 153c35b6cccc0c72de9fae06c8e2c8b2c47d79d4 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 19 May 2015 18:54:41 -0700 Subject: Btrfs: fix regression in raid level conversion Commit 2f0810880f082fa8ba66ab2c33b02e4ff9770a5e changed btrfs_set_block_group_ro to avoid trying to allocate new chunks with the new raid profile during conversion. This fixed failures when there was no space on the drive to allocate a new chunk, but the metadata reserves were sufficient to continue the conversion. But this ended up causing a regression when the drive had plenty of space to allocate new chunks, mostly because reduce_alloc_profile isn't using the new raid profile. Fixing btrfs_reduce_alloc_profile is a bigger patch. For now, do a partial revert of 2f0810880, and don't error out if we hit ENOSPC. Signed-off-by: Chris Mason Tested-by: Dave Sterba Reported-by: Holger Hoffstaette --- fs/btrfs/extent-tree.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 45e3f08..0ec3acd 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8829,6 +8829,24 @@ again: goto again; } + /* + * if we are changing raid levels, try to allocate a corresponding + * block group with the new raid level. + */ + alloc_flags = update_block_group_flags(root, cache->flags); + if (alloc_flags != cache->flags) { + ret = do_chunk_alloc(trans, root, alloc_flags, + CHUNK_ALLOC_FORCE); + /* + * ENOSPC is allowed here, we may have enough space + * already allocated at the new raid level to + * carry on + */ + if (ret == -ENOSPC) + ret = 0; + if (ret < 0) + goto out; + } ret = set_block_group_ro(cache, 0); if (!ret) -- cgit v1.1 From 7196ac113a4f38b7ca1a3282fd9edf328bd22287 Mon Sep 17 00:00:00 2001 From: Nakajima Akira Date: Wed, 22 Apr 2015 15:24:44 +0900 Subject: Fix to check Unique id and FileType when client refer file directly. When you refer file directly on cifs client, (e.g. ls -li , cd , stat ) the function return old inode number and filetype from old inode cache, though server has different inode number or filetype. When server is Windows, cifs client has same problem. When Server is Windows , This patch fixes bug in different filetype, but does not fix bug in different inode number. Because QUERY_PATH_INFO response by Windows does not include inode number(Index Number) . BUG INFO https://bugzilla.kernel.org/show_bug.cgi?id=90021 https://bugzilla.kernel.org/show_bug.cgi?id=90031 Reported-by: Nakajima Akira Signed-off-by: Nakajima Akira Reviewed-by: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/inode.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index ef71aa1..f621b44 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -401,9 +401,25 @@ int cifs_get_inode_info_unix(struct inode **pinode, rc = -ENOMEM; } else { /* we already have inode, update it */ + + /* if uniqueid is different, return error */ + if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && + CIFS_I(*pinode)->uniqueid != fattr.cf_uniqueid)) { + rc = -ESTALE; + goto cgiiu_exit; + } + + /* if filetype is different, return error */ + if (unlikely(((*pinode)->i_mode & S_IFMT) != + (fattr.cf_mode & S_IFMT))) { + rc = -ESTALE; + goto cgiiu_exit; + } + cifs_fattr_to_inode(*pinode, &fattr); } +cgiiu_exit: return rc; } @@ -838,6 +854,15 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, if (!*inode) rc = -ENOMEM; } else { + /* we already have inode, update it */ + + /* if filetype is different, return error */ + if (unlikely(((*inode)->i_mode & S_IFMT) != + (fattr.cf_mode & S_IFMT))) { + rc = -ESTALE; + goto cgii_exit; + } + cifs_fattr_to_inode(*inode, &fattr); } -- cgit v1.1 From 00b8c95b680791a72b4bb14dc371ff1f1daae39c Mon Sep 17 00:00:00 2001 From: Chengyu Song Date: Tue, 24 Mar 2015 20:18:49 -0400 Subject: cifs: potential missing check for posix_lock_file_wait posix_lock_file_wait may fail under certain circumstances, and its result is usually checked/returned. But given the complexity of cifs, I'm not sure if the result is intentially left unchecked and always expected to succeed. Signed-off-by: Chengyu Song Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 5cfa712..3f50cee 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1552,8 +1552,8 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, rc = server->ops->mand_unlock_range(cfile, flock, xid); out: - if (flock->fl_flags & FL_POSIX) - posix_lock_file_wait(file, flock); + if (flock->fl_flags & FL_POSIX && !rc) + rc = posix_lock_file_wait(file, flock); return rc; } -- cgit v1.1 From b29103076bec8316e155e71309dc0fba499022c6 Mon Sep 17 00:00:00 2001 From: Nakajima Akira Date: Thu, 9 Apr 2015 17:27:39 +0900 Subject: Fix to convert SURROGATE PAIR Garbled characters happen by using surrogate pair for filename. (replace each 1 character to ??) [Steps to Reproduce for bug] client# touch $(echo -e '\xf0\x9d\x9f\xa3') client# touch $(echo -e '\xf0\x9d\x9f\xa4') client# ls -li You see same inode number, same filename(=?? and ??) . Fix the bug about these functions do not consider about surrogate pair (and IVS). cifs_utf16_bytes() cifs_mapchar() cifs_from_utf16() cifsConvertToUTF16() Reported-by: Nakajima Akira Signed-off-by: Nakajima Akira Signed-off-by: Steve French --- fs/cifs/cifs_unicode.c | 182 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 136 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 0303c67..5a53ac6 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -27,41 +27,6 @@ #include "cifsglob.h" #include "cifs_debug.h" -/* - * cifs_utf16_bytes - how long will a string be after conversion? - * @utf16 - pointer to input string - * @maxbytes - don't go past this many bytes of input string - * @codepage - destination codepage - * - * Walk a utf16le string and return the number of bytes that the string will - * be after being converted to the given charset, not including any null - * termination required. Don't walk past maxbytes in the source buffer. - */ -int -cifs_utf16_bytes(const __le16 *from, int maxbytes, - const struct nls_table *codepage) -{ - int i; - int charlen, outlen = 0; - int maxwords = maxbytes / 2; - char tmp[NLS_MAX_CHARSET_SIZE]; - __u16 ftmp; - - for (i = 0; i < maxwords; i++) { - ftmp = get_unaligned_le16(&from[i]); - if (ftmp == 0) - break; - - charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE); - if (charlen > 0) - outlen += charlen; - else - outlen++; - } - - return outlen; -} - int cifs_remap(struct cifs_sb_info *cifs_sb) { int map_type; @@ -155,10 +120,13 @@ convert_sfm_char(const __u16 src_char, char *target) * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE). */ static int -cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, +cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp, int maptype) { int len = 1; + __u16 src_char; + + src_char = *from; if ((maptype == SFM_MAP_UNI_RSVD) && convert_sfm_char(src_char, target)) return len; @@ -168,10 +136,23 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, /* if character not one of seven in special remap set */ len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE); - if (len <= 0) { - *target = '?'; - len = 1; - } + if (len <= 0) + goto surrogate_pair; + + return len; + +surrogate_pair: + /* convert SURROGATE_PAIR and IVS */ + if (strcmp(cp->charset, "utf8")) + goto unknown; + len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6); + if (len <= 0) + goto unknown; + return len; + +unknown: + *target = '?'; + len = 1; return len; } @@ -206,7 +187,7 @@ cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, int nullsize = nls_nullsize(codepage); int fromwords = fromlen / 2; char tmp[NLS_MAX_CHARSET_SIZE]; - __u16 ftmp; + __u16 ftmp[3]; /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */ /* * because the chars can be of varying widths, we need to take care @@ -217,9 +198,17 @@ cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); for (i = 0; i < fromwords; i++) { - ftmp = get_unaligned_le16(&from[i]); - if (ftmp == 0) + ftmp[0] = get_unaligned_le16(&from[i]); + if (ftmp[0] == 0) break; + if (i + 1 < fromwords) + ftmp[1] = get_unaligned_le16(&from[i + 1]); + else + ftmp[1] = 0; + if (i + 2 < fromwords) + ftmp[2] = get_unaligned_le16(&from[i + 2]); + else + ftmp[2] = 0; /* * check to see if converting this character might make the @@ -234,6 +223,17 @@ cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, /* put converted char into 'to' buffer */ charlen = cifs_mapchar(&to[outlen], ftmp, codepage, map_type); outlen += charlen; + + /* charlen (=bytes of UTF-8 for 1 character) + * 4bytes UTF-8(surrogate pair) is charlen=4 + * (4bytes UTF-16 code) + * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4 + * (2 UTF-8 pairs divided to 2 UTF-16 pairs) */ + if (charlen == 4) + i++; + else if (charlen >= 5) + /* 5-6bytes UTF-8 */ + i += 2; } /* properly null-terminate string */ @@ -296,6 +296,46 @@ success: } /* + * cifs_utf16_bytes - how long will a string be after conversion? + * @utf16 - pointer to input string + * @maxbytes - don't go past this many bytes of input string + * @codepage - destination codepage + * + * Walk a utf16le string and return the number of bytes that the string will + * be after being converted to the given charset, not including any null + * termination required. Don't walk past maxbytes in the source buffer. + */ +int +cifs_utf16_bytes(const __le16 *from, int maxbytes, + const struct nls_table *codepage) +{ + int i; + int charlen, outlen = 0; + int maxwords = maxbytes / 2; + char tmp[NLS_MAX_CHARSET_SIZE]; + __u16 ftmp[3]; + + for (i = 0; i < maxwords; i++) { + ftmp[0] = get_unaligned_le16(&from[i]); + if (ftmp[0] == 0) + break; + if (i + 1 < maxwords) + ftmp[1] = get_unaligned_le16(&from[i + 1]); + else + ftmp[1] = 0; + if (i + 2 < maxwords) + ftmp[2] = get_unaligned_le16(&from[i + 2]); + else + ftmp[2] = 0; + + charlen = cifs_mapchar(tmp, ftmp, codepage, NO_MAP_UNI_RSVD); + outlen += charlen; + } + + return outlen; +} + +/* * cifs_strndup_from_utf16 - copy a string from wire format to the local * codepage * @src - source string @@ -409,10 +449,15 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, char src_char; __le16 dst_char; wchar_t tmp; + wchar_t *wchar_to; /* UTF-16 */ + int ret; + unicode_t u; if (map_chars == NO_MAP_UNI_RSVD) return cifs_strtoUTF16(target, source, PATH_MAX, cp); + wchar_to = kzalloc(6, GFP_KERNEL); + for (i = 0; i < srclen; j++) { src_char = source[i]; charlen = 1; @@ -441,11 +486,55 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, * if no match, use question mark, which at least in * some cases serves as wild card */ - if (charlen < 1) { - dst_char = cpu_to_le16(0x003f); - charlen = 1; + if (charlen > 0) + goto ctoUTF16; + + /* convert SURROGATE_PAIR */ + if (strcmp(cp->charset, "utf8") || !wchar_to) + goto unknown; + if (*(source + i) & 0x80) { + charlen = utf8_to_utf32(source + i, 6, &u); + if (charlen < 0) + goto unknown; + } else + goto unknown; + ret = utf8s_to_utf16s(source + i, charlen, + UTF16_LITTLE_ENDIAN, + wchar_to, 6); + if (ret < 0) + goto unknown; + + i += charlen; + dst_char = cpu_to_le16(*wchar_to); + if (charlen <= 3) + /* 1-3bytes UTF-8 to 2bytes UTF-16 */ + put_unaligned(dst_char, &target[j]); + else if (charlen == 4) { + /* 4bytes UTF-8(surrogate pair) to 4bytes UTF-16 + * 7-8bytes UTF-8(IVS) divided to 2 UTF-16 + * (charlen=3+4 or 4+4) */ + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 1)); + j++; + put_unaligned(dst_char, &target[j]); + } else if (charlen >= 5) { + /* 5-6bytes UTF-8 to 6bytes UTF-16 */ + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 1)); + j++; + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 2)); + j++; + put_unaligned(dst_char, &target[j]); } + continue; + +unknown: + dst_char = cpu_to_le16(0x003f); + charlen = 1; } + +ctoUTF16: /* * character may take more than one byte in the source string, * but will take exactly two bytes in the target string @@ -456,6 +545,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, ctoUTF16_out: put_unaligned(0, &target[j]); /* Null terminate target unicode string */ + kfree(wchar_to); return j; } -- cgit v1.1 From 4afe260bab50290a05e5732570329a530ed023f3 Mon Sep 17 00:00:00 2001 From: Federico Sauter Date: Tue, 17 Mar 2015 17:45:28 +0100 Subject: CIFS: Fix race condition on RFC1002_NEGATIVE_SESSION_RESPONSE This patch fixes a race condition that occurs when connecting to a NT 3.51 host without specifying a NetBIOS name. In that case a RFC1002_NEGATIVE_SESSION_RESPONSE is received and the SMB negotiation is reattempted, but under some conditions it leads SendReceive() to hang forever while waiting for srv_mutex. This, in turn, sets the calling process to an uninterruptible sleep state and makes it unkillable. The solution is to unlock the srv_mutex acquired in the demux thread *before* going to sleep (after the reconnect error) and before reattempting the connection. --- fs/cifs/connect.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index f3bfe08..8383d5e 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -386,6 +386,7 @@ cifs_reconnect(struct TCP_Server_Info *server) rc = generic_ip_connect(server); if (rc) { cifs_dbg(FYI, "reconnect error %d\n", rc); + mutex_unlock(&server->srv_mutex); msleep(3000); } else { atomic_inc(&tcpSesReconnectCount); @@ -393,8 +394,8 @@ cifs_reconnect(struct TCP_Server_Info *server) if (server->tcpStatus != CifsExiting) server->tcpStatus = CifsNeedNegotiate; spin_unlock(&GlobalMid_Lock); + mutex_unlock(&server->srv_mutex); } - mutex_unlock(&server->srv_mutex); } while (server->tcpStatus == CifsNeedReconnect); return rc; -- cgit v1.1 From 74f9ce1cf2830b94e189f4e99678dbf19aa3bc90 Mon Sep 17 00:00:00 2001 From: George Wang Date: Fri, 29 May 2015 07:39:34 +1000 Subject: xfs: use percpu_counter_read_positive for mp->m_icount Function percpu_counter_read just return the current counter, which can be negative. This will cause the checking of "allocated inode counts <= m_maxicount" false positive. Use percpu_counter_read_positive can solve this problem, and be consistent with the purpose to introduce percpu mechanism to xfs. Signed-off-by: George Wang Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_ialloc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 07349a1..1c9e755 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -376,7 +376,7 @@ xfs_ialloc_ag_alloc( */ newlen = args.mp->m_ialloc_inos; if (args.mp->m_maxicount && - percpu_counter_read(&args.mp->m_icount) + newlen > + percpu_counter_read_positive(&args.mp->m_icount) + newlen > args.mp->m_maxicount) return -ENOSPC; args.minlen = args.maxlen = args.mp->m_ialloc_blks; @@ -1339,10 +1339,13 @@ xfs_dialloc( * If we have already hit the ceiling of inode blocks then clear * okalloc so we scan all available agi structures for a free * inode. + * + * Read rough value of mp->m_icount by percpu_counter_read_positive, + * which will sacrifice the preciseness but improve the performance. */ if (mp->m_maxicount && - percpu_counter_read(&mp->m_icount) + mp->m_ialloc_inos > - mp->m_maxicount) { + percpu_counter_read_positive(&mp->m_icount) + mp->m_ialloc_inos + > mp->m_maxicount) { noroom = 1; okalloc = 0; } -- cgit v1.1 From 8c1903d3081ab7c416f1bbc7905f37a265d5e2e9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 29 May 2015 07:39:34 +1000 Subject: xfs: inode and free block counters need to use __percpu_counter_compare Because the counters use a custom batch size, the comparison functions need to be aware of that batch size otherwise the comparison does not work correctly. This leads to ASSERT failures on generic/027 like this: XFS: Assertion failed: 0, file: fs/xfs/xfs_mount.c, line: 1099 ------------[ cut here ]------------ .... Call Trace: [] xfs_mod_icount+0x99/0xc0 [] xfs_trans_unreserve_and_mod_sb+0x28b/0x5b0 [] xfs_log_commit_cil+0x321/0x580 [] xfs_trans_commit+0xb7/0x260 [] xfs_bmap_finish+0xcd/0x1b0 [] xfs_inactive_ifree+0x1e1/0x250 [] xfs_inactive+0x130/0x200 [] xfs_fs_evict_inode+0x91/0xf0 [] evict+0xb8/0x190 [] iput+0x18b/0x1f0 [] do_unlinkat+0x1f3/0x320 [] ? filp_close+0x5a/0x80 [] SyS_unlinkat+0x1b/0x40 [] system_call_fastpath+0x12/0x71 This is a regression introduced by commit 501ab32 ("xfs: use generic percpu counters for inode counter"). This patch fixes the same problem for both the inode counter and the free block counter in the superblocks. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_mount.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 2ce7ee3..6f23fbd 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1084,14 +1084,18 @@ xfs_log_sbcount(xfs_mount_t *mp) return xfs_sync_sb(mp, true); } +/* + * Deltas for the inode count are +/-64, hence we use a large batch size + * of 128 so we don't need to take the counter lock on every update. + */ +#define XFS_ICOUNT_BATCH 128 int xfs_mod_icount( struct xfs_mount *mp, int64_t delta) { - /* deltas are +/-64, hence the large batch size of 128. */ - __percpu_counter_add(&mp->m_icount, delta, 128); - if (percpu_counter_compare(&mp->m_icount, 0) < 0) { + __percpu_counter_add(&mp->m_icount, delta, XFS_ICOUNT_BATCH); + if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) { ASSERT(0); percpu_counter_add(&mp->m_icount, -delta); return -EINVAL; @@ -1113,6 +1117,14 @@ xfs_mod_ifree( return 0; } +/* + * Deltas for the block count can vary from 1 to very large, but lock contention + * only occurs on frequent small block count updates such as in the delayed + * allocation path for buffered writes (page a time updates). Hence we set + * a large batch count (1024) to minimise global counter updates except when + * we get near to ENOSPC and we have to be very accurate with our updates. + */ +#define XFS_FDBLOCKS_BATCH 1024 int xfs_mod_fdblocks( struct xfs_mount *mp, @@ -1151,25 +1163,19 @@ xfs_mod_fdblocks( * Taking blocks away, need to be more accurate the closer we * are to zero. * - * batch size is set to a maximum of 1024 blocks - if we are - * allocating of freeing extents larger than this then we aren't - * going to be hammering the counter lock so a lock per update - * is not a problem. - * * If the counter has a value of less than 2 * max batch size, * then make everything serialise as we are real close to * ENOSPC. */ -#define __BATCH 1024 - if (percpu_counter_compare(&mp->m_fdblocks, 2 * __BATCH) < 0) + if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH, + XFS_FDBLOCKS_BATCH) < 0) batch = 1; else - batch = __BATCH; -#undef __BATCH + batch = XFS_FDBLOCKS_BATCH; __percpu_counter_add(&mp->m_fdblocks, delta, batch); - if (percpu_counter_compare(&mp->m_fdblocks, - XFS_ALLOC_SET_ASIDE(mp)) >= 0) { + if (__percpu_counter_compare(&mp->m_fdblocks, XFS_ALLOC_SET_ASIDE(mp), + XFS_FDBLOCKS_BATCH) >= 0) { /* we had space! */ return 0; } -- cgit v1.1 From 6dea405eee9e5e73eb54a1dbcc5b65047113cd92 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 29 May 2015 07:40:06 +1000 Subject: xfs: extent size hints can round up extents past MAXEXTLEN This results in BMBT corruption, as seen by this test: # mkfs.xfs -f -d size=40051712b,agcount=4 /dev/vdc .... # mount /dev/vdc /mnt/scratch # xfs_io -ft -c "extsize 16m" -c "falloc 0 30g" -c "bmap -vp" /mnt/scratch/foo which results in this failure on a debug kernel: XFS: Assertion failed: (blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0, file: fs/xfs/libxfs/xfs_bmap_btree.c, line: 211 .... Call Trace: [] xfs_bmbt_set_allf+0x8f/0x100 [] xfs_bmbt_set_all+0x1d/0x20 [] xfs_iext_insert+0x9e/0x120 [] ? xfs_bmap_add_extent_hole_real+0x1c6/0xc70 [] xfs_bmap_add_extent_hole_real+0x1c6/0xc70 [] xfs_bmapi_write+0x72b/0xed0 [] ? kmem_cache_alloc+0x15c/0x170 [] xfs_alloc_file_space+0x160/0x400 [] ? down_write+0x29/0x60 [] xfs_file_fallocate+0x29b/0x310 [] ? __sb_start_write+0x58/0x120 [] ? do_vfs_ioctl+0x318/0x570 [] vfs_fallocate+0x140/0x260 [] SyS_fallocate+0x48/0x80 [] system_call_fastpath+0x12/0x17 The tracepoint that indicates the extent that triggered the assert failure is: xfs_iext_insert: idx 0 offset 0 block 16777224 count 2097152 flag 1 Clearly indicating that the extent length is greater than MAXEXTLEN, which is 2097151. A prior trace point shows the allocation was an exact size match and that a length greater than MAXEXTLEN was asked for: xfs_alloc_size_done: agno 1 agbno 8 minlen 2097152 maxlen 2097152 ^^^^^^^ ^^^^^^^ We don't see this problem with extent size hints through the IO path because we can't do single IOs large enough to trigger MAXEXTLEN allocation. fallocate(), OTOH, is not limited in it's allocation sizes and so needs help here. The issue is that the extent size hint alignment is rounding up the extent size past MAXEXTLEN, because xfs_bmapi_write() is not taking into account extent size hints when calculating the maximum extent length to allocate. xfs_bmapi_reserve_delalloc() is already doing this, but direct extent allocation is not. Unfortunately, the calculation in xfs_bmapi_reserve_delalloc() is wrong, and it works only because delayed allocation extents are not limited in size to MAXEXTLEN in the in-core extent tree. hence this calculation does not work for direct allocation, and the delalloc code needs fixing. This may, in fact be the underlying bug that occassionally causes transaction overruns in delayed allocation extent conversion, so now we know it's wrong we should fix it, too. Many thanks to Brian Foster for finding this problem during review of this patch. Hence the fix, after much code reading, is to allow xfs_bmap_extsize_align() to align partial extents when full alignment would extend the alignment past MAXEXTLEN. We can safely do this because all callers have higher layer allocation loops that already handle short allocations, and so will simply run another allocation to cover the remainder of the requested allocation range that we ignored during alignment. The advantage of this approach is that it also removes the need for callers to do anything other than limit their requests to MAXEXTLEN - they don't really need to be aware of extent size hints at all. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index aeffeaa..f1026e8 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3224,12 +3224,24 @@ xfs_bmap_extsize_align( align_alen += temp; align_off -= temp; } + + /* Same adjustment for the end of the requested area. */ + temp = (align_alen % extsz); + if (temp) + align_alen += extsz - temp; + /* - * Same adjustment for the end of the requested area. + * For large extent hint sizes, the aligned extent might be larger than + * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls + * the length back under MAXEXTLEN. The outer allocation loops handle + * short allocation just fine, so it is safe to do this. We only want to + * do it when we are forced to, though, because it means more allocation + * operations are required. */ - if ((temp = (align_alen % extsz))) { - align_alen += extsz - temp; - } + while (align_alen > MAXEXTLEN) + align_alen -= extsz; + ASSERT(align_alen <= MAXEXTLEN); + /* * If the previous block overlaps with this proposed allocation * then move the start forward without adjusting the length. @@ -3318,7 +3330,9 @@ xfs_bmap_extsize_align( return -EINVAL; } else { ASSERT(orig_off >= align_off); - ASSERT(orig_end <= align_off + align_alen); + /* see MAXEXTLEN handling above */ + ASSERT(orig_end <= align_off + align_alen || + align_alen + extsz > MAXEXTLEN); } #ifdef DEBUG @@ -4099,13 +4113,6 @@ xfs_bmapi_reserve_delalloc( /* Figure out the extent size, adjust alen */ extsz = xfs_get_extsz_hint(ip); if (extsz) { - /* - * Make sure we don't exceed a single extent length when we - * align the extent by reducing length we are going to - * allocate by the maximum amount extent size aligment may - * require. - */ - alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1)); error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, 1, 0, &aoff, &alen); ASSERT(!error); -- cgit v1.1 From 6dfe5a049f2d48582050339d2a6b6fda36dfd14c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 29 May 2015 07:40:08 +1000 Subject: xfs: xfs_attr_inactive leaves inconsistent attr fork state behind xfs_attr_inactive() is supposed to clean up the attribute fork when the inode is being freed. While it removes attribute fork extents, it completely ignores attributes in local format, which means that there can still be active attributes on the inode after xfs_attr_inactive() has run. This leads to problems with concurrent inode writeback - the in-core inode attribute fork is removed without locking on the assumption that nothing will be attempting to access the attribute fork after a call to xfs_attr_inactive() because it isn't supposed to exist on disk any more. To fix this, make xfs_attr_inactive() completely remove all traces of the attribute fork from the inode, regardless of it's state. Further, also remove the in-core attribute fork structure safely so that there is nothing further that needs to be done by callers to clean up the attribute fork. This means we can remove the in-core and on-disk attribute forks atomically. Also, on error simply remove the in-memory attribute fork. There's nothing that can be done with it once we have failed to remove the on-disk attribute fork, so we may as well just blow it away here anyway. cc: # 3.12 to 4.0 Reported-by: Waiman Long Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr_leaf.c | 8 ++--- fs/xfs/libxfs/xfs_attr_leaf.h | 2 +- fs/xfs/xfs_attr_inactive.c | 83 +++++++++++++++++++++++++------------------ fs/xfs/xfs_inode.c | 12 +++---- 4 files changed, 58 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 04e79d5..e9d401c 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -574,8 +574,8 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) * After the last attribute is removed revert to original inode format, * making all literal area available to the data fork once more. */ -STATIC void -xfs_attr_fork_reset( +void +xfs_attr_fork_remove( struct xfs_inode *ip, struct xfs_trans *tp) { @@ -641,7 +641,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) (mp->m_flags & XFS_MOUNT_ATTR2) && (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & XFS_DA_OP_ADDNAME)) { - xfs_attr_fork_reset(dp, args->trans); + xfs_attr_fork_remove(dp, args->trans); } else { xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); @@ -905,7 +905,7 @@ xfs_attr3_leaf_to_shortform( if (forkoff == -1) { ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); - xfs_attr_fork_reset(dp, args->trans); + xfs_attr_fork_remove(dp, args->trans); goto out; } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 025c4b8..882c8d3 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -53,7 +53,7 @@ int xfs_attr_shortform_remove(struct xfs_da_args *args); int xfs_attr_shortform_list(struct xfs_attr_list_context *context); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); - +void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); /* * Internal routines when attribute fork size == XFS_LBSIZE(mp). diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index f9c1c64..3fbf167 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -380,23 +380,31 @@ xfs_attr3_root_inactive( return error; } +/* + * xfs_attr_inactive kills all traces of an attribute fork on an inode. It + * removes both the on-disk and in-memory inode fork. Note that this also has to + * handle the condition of inodes without attributes but with an attribute fork + * configured, so we can't use xfs_inode_hasattr() here. + * + * The in-memory attribute fork is removed even on error. + */ int -xfs_attr_inactive(xfs_inode_t *dp) +xfs_attr_inactive( + struct xfs_inode *dp) { - xfs_trans_t *trans; - xfs_mount_t *mp; - int error; + struct xfs_trans *trans; + struct xfs_mount *mp; + int cancel_flags = 0; + int lock_mode = XFS_ILOCK_SHARED; + int error = 0; mp = dp->i_mount; ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); - xfs_ilock(dp, XFS_ILOCK_SHARED); - if (!xfs_inode_hasattr(dp) || - dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - xfs_iunlock(dp, XFS_ILOCK_SHARED); - return 0; - } - xfs_iunlock(dp, XFS_ILOCK_SHARED); + xfs_ilock(dp, lock_mode); + if (!XFS_IFORK_Q(dp)) + goto out_destroy_fork; + xfs_iunlock(dp, lock_mode); /* * Start our first transaction of the day. @@ -408,13 +416,18 @@ xfs_attr_inactive(xfs_inode_t *dp) * the inode in every transaction to let it float upward through * the log. */ + lock_mode = 0; trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0); - if (error) { - xfs_trans_cancel(trans, 0); - return error; - } - xfs_ilock(dp, XFS_ILOCK_EXCL); + if (error) + goto out_cancel; + + lock_mode = XFS_ILOCK_EXCL; + cancel_flags = XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT; + xfs_ilock(dp, lock_mode); + + if (!XFS_IFORK_Q(dp)) + goto out_cancel; /* * No need to make quota reservations here. We expect to release some @@ -422,29 +435,31 @@ xfs_attr_inactive(xfs_inode_t *dp) */ xfs_trans_ijoin(trans, dp, 0); - /* - * Decide on what work routines to call based on the inode size. - */ - if (!xfs_inode_hasattr(dp) || - dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - error = 0; - goto out; + /* invalidate and truncate the attribute fork extents */ + if (dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) { + error = xfs_attr3_root_inactive(&trans, dp); + if (error) + goto out_cancel; + + error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); + if (error) + goto out_cancel; } - error = xfs_attr3_root_inactive(&trans, dp); - if (error) - goto out; - error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); - if (error) - goto out; + /* Reset the attribute fork - this also destroys the in-core fork */ + xfs_attr_fork_remove(dp, trans); error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - + xfs_iunlock(dp, lock_mode); return error; -out: - xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - xfs_iunlock(dp, XFS_ILOCK_EXCL); +out_cancel: + xfs_trans_cancel(trans, cancel_flags); +out_destroy_fork: + /* kill the in-core attr fork before we drop the inode lock */ + if (dp->i_afp) + xfs_idestroy_fork(dp, XFS_ATTR_FORK); + if (lock_mode) + xfs_iunlock(dp, lock_mode); return error; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d6ebc85..1117dd3 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1946,21 +1946,17 @@ xfs_inactive( /* * If there are attributes associated with the file then blow them away * now. The code calls a routine that recursively deconstructs the - * attribute fork. We need to just commit the current transaction - * because we can't use it for xfs_attr_inactive(). + * attribute fork. If also blows away the in-core attribute fork. */ - if (ip->i_d.di_anextents > 0) { - ASSERT(ip->i_d.di_forkoff != 0); - + if (XFS_IFORK_Q(ip)) { error = xfs_attr_inactive(ip); if (error) return; } - if (ip->i_afp) - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - + ASSERT(!ip->i_afp); ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_d.di_forkoff == 0); /* * Free the inode. -- cgit v1.1 From cddc116228cb9d51d3224d23ba3e61fbbc3ec3d2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 29 May 2015 07:40:32 +1000 Subject: xfs: xfs_iozero can return positive errno It was missed when we converted everything in XFs to use negative error numbers, so fix it now. Bug introduced in 3.17 by commit 2451337 ("xfs: global error sign conversion"), and should go back to stable kernels. Thanks to Brian Foster for noticing it. cc: # 3.17, 3.18, 3.19, 4.0 Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 8121e75..3b75912 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -124,7 +124,7 @@ xfs_iozero( status = 0; } while (count); - return (-status); + return status; } int -- cgit v1.1 From 22419ac9fe5e79483596cebdbd1d1209c18bac1a Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 29 May 2015 08:14:55 +1000 Subject: xfs: fix broken i_nlink accounting for whiteout tmpfile inode XFS uses the internal tmpfile() infrastructure for the whiteout inode used for RENAME_WHITEOUT operations. For tmpfile inodes, XFS allocates the inode, drops di_nlink, adds the inode to the agi unlinked list, calls d_tmpfile() which correspondingly drops i_nlink of the vfs inode, and then finishes the common inode setup (e.g., clear I_NEW and unlock). The d_tmpfile() call was originally made inxfs_create_tmpfile(), but was pulled up out of that function as part of the following commit to resolve a deadlock issue: 330033d6 xfs: fix tmpfile/selinux deadlock and initialize security As a result, callers of xfs_create_tmpfile() are responsible for either calling d_tmpfile() or fixing up i_nlink appropriately. The whiteout tmpfile allocation helper does neither. As a result, the vfs ->i_nlink becomes inconsistent with the on-disk ->di_nlink once xfs_rename() links it back into the source dentry and calls xfs_bumplink(). Update the assert in xfs_rename() to help detect this problem in the future and update xfs_rename_alloc_whiteout() to decrement the link count as part of the manual tmpfile inode setup. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1117dd3..539a85f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2879,7 +2879,13 @@ xfs_rename_alloc_whiteout( if (error) return error; - /* Satisfy xfs_bumplink that this is a real tmpfile */ + /* + * Prepare the tmpfile inode as if it were created through the VFS. + * Otherwise, the link increment paths will complain about nlink 0->1. + * Drop the link count as done by d_tmpfile(), complete the inode setup + * and flag it as linkable. + */ + drop_nlink(VFS_I(tmpfile)); xfs_finish_inode_setup(tmpfile); VFS_I(tmpfile)->i_state |= I_LINKABLE; @@ -3147,7 +3153,7 @@ xfs_rename( * intermediate state on disk. */ if (wip) { - ASSERT(wip->i_d.di_nlink == 0); + ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0); error = xfs_bumplink(tp, wip); if (error) goto out_trans_abort; -- cgit v1.1 From 2b1d3ae940acd11be44c6eced5873d47c2e00ffa Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 28 May 2015 15:44:24 -0700 Subject: fs/binfmt_elf.c:load_elf_binary(): return -EINVAL on zero-length mappings load_elf_binary() returns `retval', not `error'. Fixes: a87938b2e246b81b4fb ("fs/binfmt_elf.c: fix bug in loading of PIE binaries") Reported-by: James Hogan Cc: Michael Davidson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 241ef68..cd46e41 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -918,7 +918,7 @@ static int load_elf_binary(struct linux_binprm *bprm) total_size = total_mapping_size(elf_phdata, loc->elf_ex.e_phnum); if (!total_size) { - error = -EINVAL; + retval = -EINVAL; goto out_free_dentry; } } -- cgit v1.1 From dcbff39da3d815f08750552fdd04f96b51751129 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 28 May 2015 15:44:29 -0700 Subject: fs, omfs: add NULL terminator in the end up the token list match_token() expects a NULL terminator at the end of the token list so that it would know where to stop. Not having one causes it to overrun to invalid memory. In practice, passing a mount option that omfs didn't recognize would sometimes panic the system. Signed-off-by: Sasha Levin Signed-off-by: Bob Copeland Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/omfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 138321b..70d4191 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -359,7 +359,7 @@ nomem: } enum { - Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask + Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask, Opt_err }; static const match_table_t tokens = { @@ -368,6 +368,7 @@ static const match_table_t tokens = { {Opt_umask, "umask=%o"}, {Opt_dmask, "dmask=%o"}, {Opt_fmask, "fmask=%o"}, + {Opt_err, NULL}, }; static int parse_options(char *options, struct omfs_sb_info *sbi) -- cgit v1.1 From 3a281f946640542a7a7372f436e101ee1fbc4c97 Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Thu, 28 May 2015 15:44:32 -0700 Subject: omfs: set error return when d_make_root() fails A static checker found the following issue in the error path for omfs_fill_super: fs/omfs/inode.c:552 omfs_fill_super() warn: missing error code here? 'd_make_root()' failed. 'ret' = '0' Fix by returning -ENOMEM in this case. Signed-off-by: Bob Copeland Reported-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/omfs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 70d4191..6ce542c 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -549,8 +549,10 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent) } sb->s_root = d_make_root(root); - if (!sb->s_root) + if (!sb->s_root) { + ret = -ENOMEM; goto out_brelse_bh2; + } printk(KERN_DEBUG "omfs: Mounted volume %s\n", omfs_rb->r_name); ret = 0; -- cgit v1.1 From c0345ee57d461343586b5e1e2f9c3c3766d07fe6 Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Thu, 28 May 2015 15:44:35 -0700 Subject: omfs: fix sign confusion for bitmap loop counter The count variable is used to iterate down to (below) zero from the size of the bitmap and handle the one-filling the remainder of the last partial bitmap block. The loop conditional expects count to be signed in order to detect when the final block is processed, after which count goes negative. Unfortunately, a recent change made this unsigned along with some other related fields. The result of is this is that during mount, omfs_get_imap will overrun the bitmap array and corrupt memory unless number of blocks happens to be a multiple of 8 * blocksize. Fix by changing count back to signed: it is guaranteed to fit in an s32 without overflow due to an enforced limit on the number of blocks in the filesystem. Signed-off-by: Bob Copeland Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/omfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 6ce542c..3d935c8 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -306,7 +306,8 @@ static const struct super_operations omfs_sops = { */ static int omfs_get_imap(struct super_block *sb) { - unsigned int bitmap_size, count, array_size; + unsigned int bitmap_size, array_size; + int count; struct omfs_sb_info *sbi = OMFS_SB(sb); struct buffer_head *bh; unsigned long **ptr; -- cgit v1.1 From 5a6b2b36a8249ec9dfb2a714acadad86d9cc0aee Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Thu, 28 May 2015 15:44:37 -0700 Subject: omfs: fix potential integer overflow in allocator Both 'i' and 'bits_per_entry' are signed integers but the result is a u64 block number. Cast i to u64 to avoid truncation on 32-bit targets. Found by Coverity (CID 200679). Signed-off-by: Bob Copeland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/omfs/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c index 0822345..83f4e76 100644 --- a/fs/omfs/bitmap.c +++ b/fs/omfs/bitmap.c @@ -159,7 +159,7 @@ int omfs_allocate_range(struct super_block *sb, goto out; found: - *return_block = i * bits_per_entry + bit; + *return_block = (u64) i * bits_per_entry + bit; *return_size = run; ret = set_run(sb, i, bits_per_entry, bit, run, 1); -- cgit v1.1 From 2159184ea01e4ae7d15f2017e296d4bc82d5aeb0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 28 May 2015 23:09:19 -0400 Subject: d_walk() might skip too much when we find that a child has died while we'd been trying to ascend, we should go into the first live sibling itself, rather than its sibling. Off-by-one in question had been introduced in "deal with deadlock in d_walk()" and the fix needs to be backported to all branches this one has been backported to. Cc: stable@vger.kernel.org # 3.2 and later Signed-off-by: Al Viro --- fs/dcache.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 656ce52..37b5afd 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1239,13 +1239,13 @@ ascend: /* might go back up the wrong parent if we have had a rename. */ if (need_seqretry(&rename_lock, seq)) goto rename_retry; - next = child->d_child.next; - while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)) { + /* go into the first sibling still alive */ + do { + next = child->d_child.next; if (next == &this_parent->d_subdirs) goto ascend; child = list_entry(next, struct dentry, d_child); - next = next->next; - } + } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)); rcu_read_unlock(); goto resume; } -- cgit v1.1 From 161f873b89136eb1e69477c847d5a5033239d9ba Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 28 Jan 2015 15:30:43 -0500 Subject: vfs: read file_handle only once in handle_to_path We used to read file_handle twice. Once to get the amount of extra bytes, and once to fetch the entire structure. This may be problematic since we do size verifications only after the first read, so if the number of extra bytes changes in userspace between the first and second calls, we'll have an incoherent view of file_handle. Instead, read the constant size once, and copy that over to the final structure without having to re-read it again. Signed-off-by: Sasha Levin Cc: Al Viro Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/fhandle.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fhandle.c b/fs/fhandle.c index 999ff5c..d59712d 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -195,8 +195,9 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, goto out_err; } /* copy the full handle */ - if (copy_from_user(handle, ufh, - sizeof(struct file_handle) + + *handle = f_handle; + if (copy_from_user(&handle->f_handle, + &ufh->f_handle, f_handle.handle_bytes)) { retval = -EFAULT; goto out_handle; -- cgit v1.1