From c7848f69ec4a8c03732cde5c949bd2aa711a9f4b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 4 Dec 2013 17:39:23 -0500 Subject: NFSv4: OPEN must handle the NFS4ERR_IO return code correctly decode_op_hdr() cannot distinguish between an XDR decoding error and the perfectly valid errorcode NFS4ERR_IO. This is normally not a problem, but for the particular case of OPEN, we need to be able to increment the NFSv4 open sequence id when the server returns a valid response. Reported-by: J Bruce Fields Link: http://lkml.kernel.org/r/20131204210356.GA19452@fieldses.org Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4xdr.c | 47 +++++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 16 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 5be2868..8c21d69 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3097,7 +3097,8 @@ out_overflow: return -EIO; } -static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected, + int *nfs_retval) { __be32 *p; uint32_t opnum; @@ -3107,19 +3108,32 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) if (unlikely(!p)) goto out_overflow; opnum = be32_to_cpup(p++); - if (opnum != expected) { - dprintk("nfs: Server returned operation" - " %d but we issued a request for %d\n", - opnum, expected); - return -EIO; - } + if (unlikely(opnum != expected)) + goto out_bad_operation; nfserr = be32_to_cpup(p); - if (nfserr != NFS_OK) - return nfs4_stat_to_errno(nfserr); - return 0; + if (nfserr == NFS_OK) + *nfs_retval = 0; + else + *nfs_retval = nfs4_stat_to_errno(nfserr); + return true; +out_bad_operation: + dprintk("nfs: Server returned operation" + " %d but we issued a request for %d\n", + opnum, expected); + *nfs_retval = -EREMOTEIO; + return false; out_overflow: print_overflow_msg(__func__, xdr); - return -EIO; + *nfs_retval = -EIO; + return false; +} + +static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) +{ + int retval; + + __decode_op_hdr(xdr, expected, &retval); + return retval; } /* Dummy routine */ @@ -5001,11 +5015,12 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) uint32_t savewords, bmlen, i; int status; - status = decode_op_hdr(xdr, OP_OPEN); - if (status != -EIO) - nfs_increment_open_seqid(status, res->seqid); - if (!status) - status = decode_stateid(xdr, &res->stateid); + if (!__decode_op_hdr(xdr, OP_OPEN, &status)) + return status; + nfs_increment_open_seqid(status, res->seqid); + if (status) + return status; + status = decode_stateid(xdr, &res->stateid); if (unlikely(status)) return status; -- cgit v1.1 From 6aa23d76a7b549521a03b63b6d5b7880ea87eab7 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 14 Nov 2013 07:25:19 -0500 Subject: nfs: check if gssd is running before attempting to use krb5i auth in SETCLIENTID call Currently, the client will attempt to use krb5i in the SETCLIENTID call even if rpc.gssd isn't running. When that fails, it'll then fall back to RPC_AUTH_UNIX. This introduced a delay when mounting if rpc.gssd isn't running, and causes warning messages to pop up in the ring buffer. Check to see if rpc.gssd is running before even attempting to use krb5i auth, and just silently skip trying to do so if it isn't. In the event that the admin is actually trying to mount with krb5*, it will still fail at a later stage of the mount attempt. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs/nfs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index b4a160a..c1b7a80 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "internal.h" #include "callback.h" #include "delegation.h" @@ -370,7 +371,11 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); - error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I); + + error = -EINVAL; + if (gssd_running(clp->cl_net)) + error = nfs_create_rpc_client(clp, timeparms, + RPC_AUTH_GSS_KRB5I); if (error == -EINVAL) error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX); if (error < 0) -- cgit v1.1 From a8c2275493b866961f4429a741251c630c4fc6d7 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sat, 21 Dec 2013 05:39:04 +0100 Subject: nfs: fix dead code of ipv6_addr_scope The correct way to check on IPV6_ADDR_SCOPE_LINKLOCAL is to check with the ipv6_addr_src_scope function. Currently this can't be work, because ipv6_addr_scope returns a int with a mask of IPV6_ADDR_SCOPE_MASK (0x00f0U) and IPV6_ADDR_SCOPE_LINKLOCAL is 0x02. So the condition is always false. Signed-off-by: Alexander Aring Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayoutdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/nfs') diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index c7c295e5..efac602 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -95,7 +95,7 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2) b6 = (struct sockaddr_in6 *)addr2; /* LINKLOCAL addresses must have matching scope_id */ - if (ipv6_addr_scope(&a6->sin6_addr) == + if (ipv6_addr_src_scope(&a6->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && a6->sin6_scope_id != b6->sin6_scope_id) return false; -- cgit v1.1 From 1e8968c5b0582392d5f132422f581e3ebc24e627 Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Tue, 17 Dec 2013 18:20:16 +0100 Subject: NFS: dprintk() should not print negative fileids and inode numbers A fileid in NFS is a uint64. There are some occurrences where dprintk() outputs a signed fileid. This leads to confusion and more difficult to read debugging (negative fileids matching positive inode numbers). Signed-off-by: Niels de Vos CC: Santosh Pradhan Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 18 +++++++++--------- fs/nfs/direct.c | 4 ++-- fs/nfs/file.c | 6 +++--- fs/nfs/inode.c | 29 +++++++++++++++-------------- fs/nfs/nfs3acl.c | 4 ++-- fs/nfs/nfs4filelayout.c | 8 ++++---- fs/nfs/read.c | 12 ++++++------ fs/nfs/write.c | 8 ++++---- 8 files changed, 45 insertions(+), 44 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 812154a..b266f73 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1404,7 +1404,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, /* Expect a negative dentry */ BUG_ON(dentry->d_inode); - dfprintk(VFS, "NFS: atomic_open(%s/%ld), %pd\n", + dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); err = nfs_check_flags(open_flags); @@ -1594,7 +1594,7 @@ int nfs_create(struct inode *dir, struct dentry *dentry, int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT; int error; - dfprintk(VFS, "NFS: create(%s/%ld), %pd\n", + dfprintk(VFS, "NFS: create(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_mode = mode; @@ -1621,7 +1621,7 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) struct iattr attr; int status; - dfprintk(VFS, "NFS: mknod(%s/%ld), %pd\n", + dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); if (!new_valid_dev(rdev)) @@ -1650,7 +1650,7 @@ int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct iattr attr; int error; - dfprintk(VFS, "NFS: mkdir(%s/%ld), %pd\n", + dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); attr.ia_valid = ATTR_MODE; @@ -1678,7 +1678,7 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry) { int error; - dfprintk(VFS, "NFS: rmdir(%s/%ld), %pd\n", + dfprintk(VFS, "NFS: rmdir(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); trace_nfs_rmdir_enter(dir, dentry); @@ -1747,7 +1747,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) int error; int need_rehash = 0; - dfprintk(VFS, "NFS: unlink(%s/%ld, %pd)\n", dir->i_sb->s_id, + dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id, dir->i_ino, dentry); trace_nfs_unlink_enter(dir, dentry); @@ -1798,7 +1798,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) unsigned int pathlen = strlen(symname); int error; - dfprintk(VFS, "NFS: symlink(%s/%ld, %pd, %s)\n", dir->i_sb->s_id, + dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s)\n", dir->i_sb->s_id, dir->i_ino, dentry, symname); if (pathlen > PAGE_SIZE) @@ -1821,7 +1821,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); trace_nfs_symlink_exit(dir, dentry, error); if (error != 0) { - dfprintk(VFS, "NFS: symlink(%s/%ld, %pd, %s) error %d\n", + dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n", dir->i_sb->s_id, dir->i_ino, dentry, symname, error); d_drop(dentry); @@ -2304,7 +2304,7 @@ out: if (!res && (mask & MAY_EXEC) && !execute_ok(inode)) res = -EACCES; - dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n", + dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n", inode->i_sb->s_id, inode->i_ino, mask, res); return res; out_notsup: diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index d71d66c..049b340 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -237,9 +237,9 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) static void nfs_direct_readpage_release(struct nfs_page *req) { - dprintk("NFS: direct read done (%s/%lld %d@%lld)\n", + dprintk("NFS: direct read done (%s/%llu %d@%lld)\n", req->wb_context->dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); nfs_release_request(req); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index e2fcacf..5bb790a 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -354,7 +354,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, struct page *page; int once_thru = 0; - dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%ld), %u@%lld)\n", + dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); start: @@ -395,7 +395,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, struct nfs_open_context *ctx = nfs_file_open_context(file); int status; - dfprintk(PAGECACHE, "NFS: write_end(%pD2(%ld), %u@%lld)\n", + dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); /* @@ -585,7 +585,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) int ret = VM_FAULT_NOPAGE; struct address_space *mapping; - dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%ld), offset %lld)\n", + dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n", filp, filp->f_mapping->host->i_ino, (long long)page_offset(page)); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 00ad1c2..5feec23 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -458,9 +458,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st unlock_new_inode(inode); } else nfs_refresh_inode(inode, fattr); - dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n", + dprintk("NFS: nfs_fhget(%s/%Lu fh_crc=0x%08x ct=%d)\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode), + (unsigned long long)NFS_FILEID(inode), nfs_display_fhandle_hash(fh), atomic_read(&inode->i_count)); @@ -870,8 +870,8 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) struct nfs_fattr *fattr = NULL; struct nfs_inode *nfsi = NFS_I(inode); - dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", - inode->i_sb->s_id, (long long)NFS_FILEID(inode)); + dfprintk(PAGECACHE, "NFS: revalidating (%s/%Lu)\n", + inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode)); trace_nfs_revalidate_inode_enter(inode); @@ -895,9 +895,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label); if (status != 0) { - dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode), status); + (unsigned long long)NFS_FILEID(inode), status); if (status == -ESTALE) { nfs_zap_caches(inode); if (!S_ISDIR(inode->i_mode)) @@ -908,9 +908,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) status = nfs_refresh_inode(inode, fattr); if (status) { - dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", + dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode), status); + (unsigned long long)NFS_FILEID(inode), status); goto err_out; } @@ -919,9 +919,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) nfs_setsecurity(inode, fattr, label); - dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n", + dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode)); + (unsigned long long)NFS_FILEID(inode)); err_out: nfs4_label_free(label); @@ -985,8 +985,9 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); nfs_fscache_wait_on_invalidate(inode); - dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", - inode->i_sb->s_id, (long long)NFS_FILEID(inode)); + dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n", + inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(inode)); return 0; } @@ -1434,7 +1435,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) unsigned long now = jiffies; unsigned long save_cache_validity; - dfprintk(VFS, "NFS: %s(%s/%ld fh_crc=0x%08x ct=%d info=0x%x)\n", + dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), atomic_read(&inode->i_count), fattr->valid); @@ -1455,7 +1456,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) /* * Big trouble! The inode has become a different object. */ - printk(KERN_DEBUG "NFS: %s: inode %ld mode changed, %07o to %07o\n", + printk(KERN_DEBUG "NFS: %s: inode %lu mode changed, %07o to %07o\n", __func__, inode->i_ino, inode->i_mode, fattr->mode); goto out_err; } diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 4a1aafb..34e918d 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -130,7 +130,7 @@ static void __nfs3_forget_cached_acls(struct nfs_inode *nfsi) void nfs3_forget_cached_acls(struct inode *inode) { - dprintk("NFS: nfs3_forget_cached_acls(%s/%ld)\n", inode->i_sb->s_id, + dprintk("NFS: nfs3_forget_cached_acls(%s/%lu)\n", inode->i_sb->s_id, inode->i_ino); spin_lock(&inode->i_lock); __nfs3_forget_cached_acls(NFS_I(inode)); @@ -161,7 +161,7 @@ static struct posix_acl *nfs3_get_cached_acl(struct inode *inode, int type) acl = posix_acl_dup(acl); out: spin_unlock(&inode->i_lock); - dprintk("NFS: nfs3_get_cached_acl(%s/%ld, %d) = %p\n", inode->i_sb->s_id, + dprintk("NFS: nfs3_get_cached_acl(%s/%lu, %d) = %p\n", inode->i_sb->s_id, inode->i_ino, type, acl); return acl; } diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index b86464b..0a93e79 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -91,10 +91,10 @@ static void filelayout_reset_write(struct nfs_write_data *data) if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { dprintk("%s Reset task %5u for i/o through MDS " - "(req %s/%lld, %u bytes @ offset %llu)\n", __func__, + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, data->task.tk_pid, hdr->inode->i_sb->s_id, - (long long)NFS_FILEID(hdr->inode), + (unsigned long long)NFS_FILEID(hdr->inode), data->args.count, (unsigned long long)data->args.offset); @@ -112,10 +112,10 @@ static void filelayout_reset_read(struct nfs_read_data *data) if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { dprintk("%s Reset task %5u for i/o through MDS " - "(req %s/%lld, %u bytes @ offset %llu)\n", __func__, + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, data->task.tk_pid, hdr->inode->i_sb->s_id, - (long long)NFS_FILEID(hdr->inode), + (unsigned long long)NFS_FILEID(hdr->inode), data->args.count, (unsigned long long)data->args.offset); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 31db5c3..411aedd 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -163,9 +163,9 @@ static void nfs_readpage_release(struct nfs_page *req) unlock_page(req->wb_page); - dprintk("NFS: read done (%s/%Ld %d@%Ld)\n", + dprintk("NFS: read done (%s/%Lu %d@%Ld)\n", req->wb_context->dentry->d_inode->i_sb->s_id, - (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); nfs_release_request(req); @@ -228,11 +228,11 @@ int nfs_initiate_read(struct rpc_clnt *clnt, /* Set up the initial task struct. */ NFS_PROTO(inode)->read_setup(data, &msg); - dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ " + dprintk("NFS: %5u initiated read call (req %s/%llu, %u bytes @ " "offset %llu)\n", data->task.tk_pid, inode->i_sb->s_id, - (long long)NFS_FILEID(inode), + (unsigned long long)NFS_FILEID(inode), data->args.count, (unsigned long long)data->args.offset); @@ -630,9 +630,9 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, unsigned long npages; int ret = -ESTALE; - dprintk("NFS: nfs_readpages (%s/%Ld %d)\n", + dprintk("NFS: nfs_readpages (%s/%Lu %d)\n", inode->i_sb->s_id, - (long long)NFS_FILEID(inode), + (unsigned long long)NFS_FILEID(inode), nr_pages); nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c1d5482..77a00c6 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1013,10 +1013,10 @@ int nfs_initiate_write(struct rpc_clnt *clnt, NFS_PROTO(inode)->write_setup(data, &msg); dprintk("NFS: %5u initiated write call " - "(req %s/%lld, %u bytes @ offset %llu)\n", + "(req %s/%llu, %u bytes @ offset %llu)\n", data->task.tk_pid, inode->i_sb->s_id, - (long long)NFS_FILEID(inode), + (unsigned long long)NFS_FILEID(inode), data->args.count, (unsigned long long)data->args.offset); @@ -1606,9 +1606,9 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) nfs_list_remove_request(req); nfs_clear_page_commit(req->wb_page); - dprintk("NFS: commit (%s/%lld %d@%lld)", + dprintk("NFS: commit (%s/%llu %d@%lld)", req->wb_context->dentry->d_sb->s_id, - (long long)NFS_FILEID(req->wb_context->dentry->d_inode), + (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); if (status < 0) { -- cgit v1.1 From 16a6ddc70920a0686dbf90e092a539c1a4fd7b77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toralf=20F=C3=B6rster?= Date: Thu, 12 Dec 2013 19:09:00 +0100 Subject: point to the right include file in a comment (left over from a9004abc3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Toralf Förster Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 059c01b..e5be725 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1071,7 +1071,7 @@ void nfs_free_seqid(struct nfs_seqid *seqid) /* * Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or * failed with a seqid incrementing error - - * see comments nfs_fs.h:seqid_mutating_error() + * see comments nfs4.h:seqid_mutating_error() */ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) { @@ -1116,7 +1116,7 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) /* * Increment the seqid if the LOCK/LOCKU succeeded, or * failed with a seqid incrementing error - - * see comments nfs_fs.h:seqid_mutating_error() + * see comments nfs4.h:seqid_mutating_error() */ void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid) { -- cgit v1.1 From d8c951c313ed1d7144b55c0d56f7c53220044dda Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 13 Jan 2014 12:08:11 -0500 Subject: NFSv4.1: Don't trust attributes if a pNFS LAYOUTCOMMIT is outstanding If a LAYOUTCOMMIT is outstanding, then chances are that the metadata server may still be returning incorrect values for the change attribute, ctime, mtime and/or size. Just ignore those attributes for now, and wait for the LAYOUTCOMMIT rpc call to finish. Reported-by: shaobingqing Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 19 +++++++++++++++++-- fs/nfs/nfs4proc.c | 5 ++--- fs/nfs/pnfs.h | 16 ++++++++++++++++ 3 files changed, 35 insertions(+), 5 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 5feec23..c63e152 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1283,12 +1283,28 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); } +/* + * Don't trust the change_attribute, mtime, ctime or size if + * a pnfs LAYOUTCOMMIT is outstanding + */ +static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode, + struct nfs_fattr *fattr) +{ + if (pnfs_layoutcommit_outstanding(inode)) + fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE | + NFS_ATTR_FATTR_MTIME | + NFS_ATTR_FATTR_CTIME | + NFS_ATTR_FATTR_SIZE); +} + static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { int ret; trace_nfs_refresh_inode_enter(inode); + nfs_inode_attrs_handle_layoutcommit(inode, fattr); + if (nfs_inode_attrs_need_update(inode, fattr)) ret = nfs_update_inode(inode, fattr); else @@ -1518,8 +1534,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (new_isize != cur_isize) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? */ - if ((nfsi->npages == 0 && !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) || - new_isize > cur_isize) { + if ((nfsi->npages == 0) || new_isize > cur_isize) { i_size_write(inode, new_isize); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 15052b8..f4908eb 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7780,10 +7780,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) case -NFS4ERR_BADLAYOUT: /* no layout */ case -NFS4ERR_GRACE: /* loca_recalim always false */ task->tk_status = 0; - break; case 0: - nfs_post_op_update_inode_force_wcc(data->args.inode, - data->res.fattr); break; default: if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { @@ -7798,6 +7795,8 @@ static void nfs4_layoutcommit_release(void *calldata) struct nfs4_layoutcommit_data *data = calldata; pnfs_cleanup_layoutcommit(data); + nfs_post_op_update_inode_force_wcc(data->args.inode, + data->res.fattr); put_rpccred(data->cred); kfree(data); } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index a4f4181..0237939 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -359,6 +359,15 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode) PNFS_LAYOUTRET_ON_SETATTR; } +static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + return test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags) != 0 || + test_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags) != 0; +} + static inline int pnfs_return_layout(struct inode *ino) { struct nfs_inode *nfsi = NFS_I(ino); @@ -515,6 +524,13 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src, return false; } +static inline bool +pnfs_layoutcommit_outstanding(struct inode *inode) +{ + return false; +} + + static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) { return NULL; -- cgit v1.1 From 71244d9bdf185e5bba1473254241f9f65d4dd0d8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 13 Jan 2014 13:34:36 -0500 Subject: NFSv4.1: Fix a race in nfs4_write_inode nfs4_write_inode() must not be allowed to exit until the layoutcommit is done. That means that both NFS_INO_LAYOUTCOMMIT and NFS_INO_LAYOUTCOMMITTING have to be cleared. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4super.c | 14 +++--------- fs/nfs/pnfs.c | 67 ++++++++++++++++++++++++++++-------------------------- 2 files changed, 38 insertions(+), 43 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 65ab0a0..808f295 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -77,17 +77,9 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc) { int ret = nfs_write_inode(inode, wbc); - if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) { - int status; - bool sync = true; - - if (wbc->sync_mode == WB_SYNC_NONE) - sync = false; - - status = pnfs_layoutcommit_inode(inode, sync); - if (status < 0) - return status; - } + if (ret == 0) + ret = pnfs_layoutcommit_inode(inode, + wbc->sync_mode == WB_SYNC_ALL); return ret; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index d75d938..4755858 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1790,6 +1790,15 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) } EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages); +static void pnfs_clear_layoutcommitting(struct inode *inode) +{ + unsigned long *bitlock = &NFS_I(inode)->flags; + + clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); +} + /* * There can be multiple RW segments. */ @@ -1807,7 +1816,6 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp) static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp) { struct pnfs_layout_segment *lseg, *tmp; - unsigned long *bitlock = &NFS_I(inode)->flags; /* Matched by references in pnfs_set_layoutcommit */ list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) { @@ -1815,9 +1823,7 @@ static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *lis pnfs_put_lseg(lseg); } - clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); - smp_mb__after_clear_bit(); - wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); + pnfs_clear_layoutcommitting(inode); } void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) @@ -1881,43 +1887,37 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) struct nfs4_layoutcommit_data *data; struct nfs_inode *nfsi = NFS_I(inode); loff_t end_pos; - int status = 0; + int status; - dprintk("--> %s inode %lu\n", __func__, inode->i_ino); - - if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + if (!pnfs_layoutcommit_outstanding(inode)) return 0; - /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ - data = kzalloc(sizeof(*data), GFP_NOFS); - if (!data) { - status = -ENOMEM; - goto out; - } - - if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) - goto out_free; + dprintk("--> %s inode %lu\n", __func__, inode->i_ino); + status = -EAGAIN; if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { - if (!sync) { - status = -EAGAIN; - goto out_free; - } - status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, - nfs_wait_bit_killable, TASK_KILLABLE); + if (!sync) + goto out; + status = wait_on_bit_lock(&nfsi->flags, + NFS_INO_LAYOUTCOMMITTING, + nfs_wait_bit_killable, + TASK_KILLABLE); if (status) - goto out_free; + goto out; } - INIT_LIST_HEAD(&data->lseg_list); + status = -ENOMEM; + /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ + data = kzalloc(sizeof(*data), GFP_NOFS); + if (!data) + goto clear_layoutcommitting; + + status = 0; spin_lock(&inode->i_lock); - if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { - clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags); - spin_unlock(&inode->i_lock); - wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING); - goto out_free; - } + if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + goto out_unlock; + INIT_LIST_HEAD(&data->lseg_list); pnfs_list_write_lseg(inode, &data->lseg_list); end_pos = nfsi->layout->plh_lwb; @@ -1940,8 +1940,11 @@ out: mark_inode_dirty_sync(inode); dprintk("<-- %s status %d\n", __func__, status); return status; -out_free: +out_unlock: + spin_unlock(&inode->i_lock); kfree(data); +clear_layoutcommitting: + pnfs_clear_layoutcommitting(inode); goto out; } -- cgit v1.1 From 78b19bae0813bd6f921ca58490196abd101297bd Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Mon, 13 Jan 2014 16:54:45 -0500 Subject: nfs4.1: properly handle ENOTSUP in SECINFO_NO_NAME Don't check for -NFS4ERR_NOTSUPP, it's already been mapped to -ENOTSUPP by nfs4_stat_to_errno. This allows the client to mount v4.1 servers that don't support SECINFO_NO_NAME by falling back to the "guess and check" method of nfs4_find_root_sec. Signed-off-by: Weston Andros Adamson Cc: stable@vger.kernel.org # 3.1+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f4908eb..a9eaaaa 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7919,7 +7919,7 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, switch (err) { case 0: case -NFS4ERR_WRONGSEC: - case -NFS4ERR_NOTSUPP: + case -ENOTSUPP: goto out; default: err = nfs4_handle_exception(server, err, &exception); @@ -7953,7 +7953,7 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle, * Fall back on "guess and check" method if * the server doesn't support SECINFO_NO_NAME */ - if (err == -NFS4ERR_WRONGSEC || err == -NFS4ERR_NOTSUPP) { + if (err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP) { err = nfs4_find_root_sec(server, fhandle, info); goto out_freepage; } -- cgit v1.1 From 9811cd57f4c6b5b60ec104de68a88303717e3106 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2013 08:50:28 -0800 Subject: nfs: fix size updates for aio writes nfs_file_direct_write only updates the inode size if it succeeded and returned the number of bytes written. But in the AIO case nfs_direct_wait turns the return value into -EIOCBQUEUED and we skip the size update. Instead the aio completion path should updated it, which this patch does. The implementation is a little hacky because there is no obvious way to find out we are called for a write in nfs_direct_complete. Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 049b340..ce52a97 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -222,12 +222,23 @@ out: * Synchronous I/O uses a stack-allocated iocb. Thus we can't trust * the iocb is still valid here if this is a synchronous request. */ -static void nfs_direct_complete(struct nfs_direct_req *dreq) +static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) { + struct inode *inode = dreq->inode; + if (dreq->iocb) { + loff_t pos = dreq->iocb->ki_pos + dreq->count; long res = (long) dreq->error; if (!res) res = (long) dreq->count; + + if (write) { + spin_lock(&inode->i_lock); + if (i_size_read(inode) < pos) + i_size_write(inode, pos); + spin_unlock(&inode->i_lock); + } + aio_complete(dreq->iocb, res, 0); } complete_all(&dreq->completion); @@ -272,7 +283,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) } out_put: if (put_dreq(dreq)) - nfs_direct_complete(dreq); + nfs_direct_complete(dreq, false); hdr->release(hdr); } @@ -434,7 +445,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, } if (put_dreq(dreq)) - nfs_direct_complete(dreq); + nfs_direct_complete(dreq, false); return 0; } @@ -594,7 +605,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) break; default: nfs_inode_dio_write_done(dreq->inode); - nfs_direct_complete(dreq); + nfs_direct_complete(dreq, true); } } @@ -611,7 +622,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) { nfs_inode_dio_write_done(inode); - nfs_direct_complete(dreq); + nfs_direct_complete(dreq, true); } #endif -- cgit v1.1 From 2a009ec98cce440c0992fc9a2353e96cdb0b048b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2013 08:50:29 -0800 Subject: nfs: defer inode_dio_done call until size update is done We need to have the I/O fully finished before telling the truncate code that we are done. Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index ce52a97..75ed2a9 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -226,21 +226,27 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) { struct inode *inode = dreq->inode; - if (dreq->iocb) { + if (dreq->iocb && write) { loff_t pos = dreq->iocb->ki_pos + dreq->count; + + spin_lock(&inode->i_lock); + if (i_size_read(inode) < pos) + i_size_write(inode, pos); + spin_unlock(&inode->i_lock); + } + + if (write) { + nfs_zap_mapping(inode, inode->i_mapping); + inode_dio_done(inode); + } + + if (dreq->iocb) { long res = (long) dreq->error; if (!res) res = (long) dreq->count; - - if (write) { - spin_lock(&inode->i_lock); - if (i_size_read(inode) < pos) - i_size_write(inode, pos); - spin_unlock(&inode->i_lock); - } - aio_complete(dreq->iocb, res, 0); } + complete_all(&dreq->completion); nfs_direct_req_release(dreq); @@ -483,12 +489,6 @@ out: return result; } -static void nfs_inode_dio_write_done(struct inode *inode) -{ - nfs_zap_mapping(inode, inode->i_mapping); - inode_dio_done(inode); -} - #if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { @@ -604,7 +604,6 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) nfs_direct_write_reschedule(dreq); break; default: - nfs_inode_dio_write_done(dreq->inode); nfs_direct_complete(dreq, true); } } @@ -621,7 +620,6 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) { - nfs_inode_dio_write_done(inode); nfs_direct_complete(dreq, true); } #endif -- cgit v1.1 From 1f90ee27461e31a1c18e5d819f6ea6f5c7304b16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2013 08:50:30 -0800 Subject: nfs: increment i_dio_count for reads, too i_dio_count is used to protect dio access against truncate. We want to make sure there are no dio reads pending either when doing a truncate. I suspect on plain NFS things might work even without this, but once we use a pnfs layout driver that access backing devices directly things will go bad without the proper synchronization. Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 75ed2a9..6c232107 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -235,10 +235,10 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) spin_unlock(&inode->i_lock); } - if (write) { + if (write) nfs_zap_mapping(inode, inode->i_mapping); - inode_dio_done(inode); - } + + inode_dio_done(inode); if (dreq->iocb) { long res = (long) dreq->error; @@ -419,6 +419,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, loff_t pos, bool uio) { struct nfs_pageio_descriptor desc; + struct inode *inode = dreq->inode; ssize_t result = -EINVAL; size_t requested_bytes = 0; unsigned long seg; @@ -427,6 +428,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, &nfs_direct_read_completion_ops); get_dreq(dreq); desc.pg_dreq = dreq; + atomic_inc(&inode->i_dio_count); for (seg = 0; seg < nr_segs; seg++) { const struct iovec *vec = &iov[seg]; @@ -446,6 +448,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * generic layer handle the completion. */ if (requested_bytes == 0) { + inode_dio_done(inode); nfs_direct_req_release(dreq); return result < 0 ? result : -EIO; } -- cgit v1.1 From 14a3ec79437252d922bae574ecbf0c0584c330f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2013 08:50:31 -0800 Subject: nfs: merge nfs_direct_read into nfs_file_direct_read Simple code cleanup to prepare for later fixes. Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 107 ++++++++++++++++++++++++++------------------------------ 1 file changed, 49 insertions(+), 58 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 6c232107..f8b0a81 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -458,14 +458,55 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, return 0; } -static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, bool uio) +/** + * nfs_file_direct_read - file direct read operation for NFS files + * @iocb: target I/O control block + * @iov: vector of user buffers into which to read data + * @nr_segs: size of iov vector + * @pos: byte offset in file where reading starts + * + * We use this function for direct reads instead of calling + * generic_file_aio_read() in order to avoid gfar's check to see if + * the request starts before the end of the file. For that check + * to work, we must generate a GETATTR before each direct read, and + * even then there is a window between the GETATTR and the subsequent + * READ where the file size could change. Our preference is simply + * to do all reads the application wants, and the server will take + * care of managing the end of file boundary. + * + * This function also eliminates unnecessarily updating the file's + * atime locally, as the NFS server sets the file's atime, and this + * client must read the updated atime from the server back into its + * cache. + */ +ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, bool uio) { - ssize_t result = -ENOMEM; - struct inode *inode = iocb->ki_filp->f_mapping->host; + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; struct nfs_direct_req *dreq; struct nfs_lock_context *l_ctx; + ssize_t result = -EINVAL; + size_t count; + + count = iov_length(iov, nr_segs); + nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); + + dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n", + file, count, (long long) pos); + + result = 0; + if (!count) + goto out; + + result = nfs_sync_mapping(mapping); + if (result) + goto out; + + task_io_account_read(count); + result = -ENOMEM; dreq = nfs_direct_req_alloc(); if (dreq == NULL) goto out; @@ -484,8 +525,11 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, NFS_I(inode)->read_io += iov_length(iov, nr_segs); result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio); - if (!result) + if (!result) { result = nfs_direct_wait(dreq); + if (result > 0) + iocb->ki_pos = pos + result; + } out_release: nfs_direct_req_release(dreq); out: @@ -889,59 +933,6 @@ out: } /** - * nfs_file_direct_read - file direct read operation for NFS files - * @iocb: target I/O control block - * @iov: vector of user buffers into which to read data - * @nr_segs: size of iov vector - * @pos: byte offset in file where reading starts - * - * We use this function for direct reads instead of calling - * generic_file_aio_read() in order to avoid gfar's check to see if - * the request starts before the end of the file. For that check - * to work, we must generate a GETATTR before each direct read, and - * even then there is a window between the GETATTR and the subsequent - * READ where the file size could change. Our preference is simply - * to do all reads the application wants, and the server will take - * care of managing the end of file boundary. - * - * This function also eliminates unnecessarily updating the file's - * atime locally, as the NFS server sets the file's atime, and this - * client must read the updated atime from the server back into its - * cache. - */ -ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, bool uio) -{ - ssize_t retval = -EINVAL; - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - size_t count; - - count = iov_length(iov, nr_segs); - nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); - - dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n", - file, count, (long long) pos); - - retval = 0; - if (!count) - goto out; - - retval = nfs_sync_mapping(mapping); - if (retval) - goto out; - - task_io_account_read(count); - - retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio); - if (retval > 0) - iocb->ki_pos = pos + retval; - -out: - return retval; -} - -/** * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block * @iov: vector of user buffers from which to write data -- cgit v1.1 From 22cd1bf1480133518b3e5568466759ffde649b35 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2013 08:50:32 -0800 Subject: nfs: merge nfs_direct_write into nfs_file_direct_write Simple code cleanup to prepare for later fixes. Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 91 ++++++++++++++++++++++++++------------------------------- 1 file changed, 41 insertions(+), 50 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index f8b0a81..cbfbd17 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -898,40 +898,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, return 0; } -static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, - size_t count, bool uio) -{ - ssize_t result = -ENOMEM; - struct inode *inode = iocb->ki_filp->f_mapping->host; - struct nfs_direct_req *dreq; - struct nfs_lock_context *l_ctx; - - dreq = nfs_direct_req_alloc(); - if (!dreq) - goto out; - - dreq->inode = inode; - dreq->bytes_left = count; - dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); - l_ctx = nfs_get_lock_context(dreq->ctx); - if (IS_ERR(l_ctx)) { - result = PTR_ERR(l_ctx); - goto out_release; - } - dreq->l_ctx = l_ctx; - if (!is_sync_kiocb(iocb)) - dreq->iocb = iocb; - - result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio); - if (!result) - result = nfs_direct_wait(dreq); -out_release: - nfs_direct_req_release(dreq); -out: - return result; -} - /** * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block @@ -957,9 +923,12 @@ out: ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos, bool uio) { - ssize_t retval = -EINVAL; + ssize_t result = -EINVAL; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct nfs_direct_req *dreq; + struct nfs_lock_context *l_ctx; size_t count; count = iov_length(iov, nr_segs); @@ -968,35 +937,57 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", file, count, (long long) pos); - retval = generic_write_checks(file, &pos, &count, 0); - if (retval) + result = generic_write_checks(file, &pos, &count, 0); + if (result) goto out; - retval = -EINVAL; + result = -EINVAL; if ((ssize_t) count < 0) goto out; - retval = 0; + result = 0; if (!count) goto out; - retval = nfs_sync_mapping(mapping); - if (retval) + result = nfs_sync_mapping(mapping); + if (result) goto out; task_io_account_write(count); - retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio); - if (retval > 0) { - struct inode *inode = mapping->host; + result = -ENOMEM; + dreq = nfs_direct_req_alloc(); + if (!dreq) + goto out; - iocb->ki_pos = pos + retval; - spin_lock(&inode->i_lock); - if (i_size_read(inode) < iocb->ki_pos) - i_size_write(inode, iocb->ki_pos); - spin_unlock(&inode->i_lock); + dreq->inode = inode; + dreq->bytes_left = count; + dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + l_ctx = nfs_get_lock_context(dreq->ctx); + if (IS_ERR(l_ctx)) { + result = PTR_ERR(l_ctx); + goto out_release; + } + dreq->l_ctx = l_ctx; + if (!is_sync_kiocb(iocb)) + dreq->iocb = iocb; + + result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio); + if (!result) { + result = nfs_direct_wait(dreq); + if (result > 0) { + struct inode *inode = mapping->host; + + iocb->ki_pos = pos + result; + spin_lock(&inode->i_lock); + if (i_size_read(inode) < iocb->ki_pos) + i_size_write(inode, iocb->ki_pos); + spin_unlock(&inode->i_lock); + } } +out_release: + nfs_direct_req_release(dreq); out: - return retval; + return result; } /** -- cgit v1.1 From d0b9875d65c1abcc9d405d648660dfb919353959 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2013 08:50:33 -0800 Subject: nfs: take i_mutex during direct I/O reads We'll need the i_mutex to prevent i_dio_count from incrementing while truncate is waiting for it to reach zero, and protects against having the pagecache repopulated after we flushed it. Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index cbfbd17..85e4e4b 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -500,16 +500,17 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, if (!count) goto out; + mutex_lock(&inode->i_mutex); result = nfs_sync_mapping(mapping); if (result) - goto out; + goto out_unlock; task_io_account_read(count); result = -ENOMEM; dreq = nfs_direct_req_alloc(); if (dreq == NULL) - goto out; + goto out_unlock; dreq->inode = inode; dreq->bytes_left = iov_length(iov, nr_segs); @@ -525,13 +526,22 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, NFS_I(inode)->read_io += iov_length(iov, nr_segs); result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio); + + mutex_unlock(&inode->i_mutex); + if (!result) { result = nfs_direct_wait(dreq); if (result > 0) iocb->ki_pos = pos + result; } + + nfs_direct_req_release(dreq); + return result; + out_release: nfs_direct_req_release(dreq); +out_unlock: + mutex_unlock(&inode->i_mutex); out: return result; } -- cgit v1.1 From a9ab5e840669b19aca2974e2c771a77df2876434 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2013 08:50:34 -0800 Subject: nfs: page cache invalidation for dio Make sure to properly invalidate the pagecache before performing direct I/O, so that no stale pages are left around. This matches what the generic direct I/O code does. Also take the i_mutex over the direct write submission to avoid the lifelock vs truncate waiting for i_dio_count to decrease, and to avoid having the pagecache easily repopulated while direct I/O is in progrss. Again matching the generic direct I/O code. Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 85e4e4b..b8797ae 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -939,9 +939,12 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, struct inode *inode = mapping->host; struct nfs_direct_req *dreq; struct nfs_lock_context *l_ctx; + loff_t end; size_t count; count = iov_length(iov, nr_segs); + end = (pos + count - 1) >> PAGE_CACHE_SHIFT; + nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", @@ -958,16 +961,25 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, if (!count) goto out; + mutex_lock(&inode->i_mutex); + result = nfs_sync_mapping(mapping); if (result) - goto out; + goto out_unlock; + + if (mapping->nrpages) { + result = invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + if (result) + goto out_unlock; + } task_io_account_write(count); result = -ENOMEM; dreq = nfs_direct_req_alloc(); if (!dreq) - goto out; + goto out_unlock; dreq->inode = inode; dreq->bytes_left = count; @@ -982,6 +994,14 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, dreq->iocb = iocb; result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio); + + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + } + + mutex_unlock(&inode->i_mutex); + if (!result) { result = nfs_direct_wait(dreq); if (result > 0) { @@ -994,8 +1014,13 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, spin_unlock(&inode->i_lock); } } + nfs_direct_req_release(dreq); + return result; + out_release: nfs_direct_req_release(dreq); +out_unlock: + mutex_unlock(&inode->i_mutex); out: return result; } -- cgit v1.1 From 263b4509ec4d47e0da3e753f85a39ea12d1eff24 Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Fri, 17 Jan 2014 15:12:05 -0500 Subject: nfs: always make sure page is up-to-date before extending a write to cover the entire page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We should always make sure the cached page is up-to-date when we're determining whether we can extend a write to cover the full page -- even if we've received a write delegation from the server. Commit c7559663 added logic to skip this check if we have a write delegation, which can lead to data corruption such as the following scenario if client B receives a write delegation from the NFS server: Client A: # echo 123456789 > /mnt/file Client B: # echo abcdefghi >> /mnt/file # cat /mnt/file 0�D0�abcdefghi Just because we hold a write delegation doesn't mean that we've read in the entire page contents. Cc: # v3.11+ Signed-off-by: Scott Mayhew Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 77a00c6..a44a872 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -922,19 +922,20 @@ out: * extend the write to cover the entire page in order to avoid fragmentation * inefficiencies. * - * If the file is opened for synchronous writes or if we have a write delegation - * from the server then we can just skip the rest of the checks. + * If the file is opened for synchronous writes then we can just skip the rest + * of the checks. */ static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode) { if (file->f_flags & O_DSYNC) return 0; + if (!nfs_write_pageuptodate(page, inode)) + return 0; if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) return 1; - if (nfs_write_pageuptodate(page, inode) && (inode->i_flock == NULL || - (inode->i_flock->fl_start == 0 && + if (inode->i_flock == NULL || (inode->i_flock->fl_start == 0 && inode->i_flock->fl_end == OFFSET_MAX && - inode->i_flock->fl_type != F_RDLCK))) + inode->i_flock->fl_type != F_RDLCK)) return 1; return 0; } -- cgit v1.1 From 64590daa9e0dfb3aad89e3ab9230683b76211d5b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Jan 2014 17:03:41 -0500 Subject: NFSv4.1: Handle errors correctly in nfs41_walk_client_list Both nfs41_walk_client_list and nfs40_walk_client_list expect the 'status' variable to be set to the value -NFS4ERR_STALE_CLIENTID if the loop fails to find a match. The problem is that the 'pos->cl_cons_state > NFS_CS_READY' changes the value of 'status', and sets it either to the value '0' (which indicates success), or to the value EINTR. Cc: stable@vger.kernel.org # 3.7.x: 7b1f1fd1842e6: NFSv4/4.1: Fix bugs in Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index c1b7a80..06e770a 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -498,9 +498,10 @@ int nfs40_walk_client_list(struct nfs_client *new, prev = pos; status = nfs_wait_client_init_complete(pos); - spin_lock(&nn->nfs_client_lock); if (status < 0) - continue; + goto out; + status = -NFS4ERR_STALE_CLIENTID; + spin_lock(&nn->nfs_client_lock); } if (pos->cl_cons_state != NFS_CS_READY) continue; @@ -638,7 +639,8 @@ int nfs41_walk_client_list(struct nfs_client *new, } spin_lock(&nn->nfs_client_lock); if (status < 0) - continue; + break; + status = -NFS4ERR_STALE_CLIENTID; } if (pos->cl_cons_state != NFS_CS_READY) continue; -- cgit v1.1 From abad2fa5ba67725a3f9c376c8cfe76fbe94a3041 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Sun, 19 Jan 2014 22:45:36 -0500 Subject: nfs4: fix discover_server_trunking use after free If clp is new (cl_count = 1) and it matches another client in nfs4_discover_server_trunking, the nfs_put_client will free clp before ->cl_preserve_clid is set. Cc: stable@vger.kernel.org # 3.7+ Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 06e770a..73d4ecd 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -414,13 +414,11 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, error = nfs4_discover_server_trunking(clp, &old); if (error < 0) goto error; - nfs_put_client(clp); - if (clp != old) { - clp->cl_preserve_clid = true; - clp = old; - } - return clp; + if (clp != old) + clp->cl_preserve_clid = true; + nfs_put_client(clp); + return old; error: nfs_mark_client_ready(clp, error); -- cgit v1.1 From 471252cd8b34b0609973740b25dcd1ff01dc1889 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Tue, 21 Jan 2014 15:21:33 -0500 Subject: pnfs: fix BUG in filelayout_recover_commit_reqs cond_resched_lock(cinfo->lock) is called everywhere else while holding the cinfo->lock spinlock. Not holding this lock while calling transfer_commit_list in filelayout_recover_commit_reqs causes the BUG below. It's true that we can't hold this lock while calling pnfs_put_lseg, because that might try to lock the inode lock - which might be the same lock as cinfo->lock. To reproduce, mount a 2 DS pynfs server and run an O_DIRECT command that crosses a stripe boundary and is not page aligned, such as: dd if=/dev/zero of=/mnt/f bs=17000 count=1 oflag=direct BUG: sleeping function called from invalid context at linux/fs/nfs/nfs4filelayout.c:1161 in_atomic(): 0, irqs_disabled(): 0, pid: 27, name: kworker/0:1 2 locks held by kworker/0:1/27: #0: (events){.+.+.+}, at: [] process_one_work+0x175/0x3a5 #1: ((&dreq->work)){+.+...}, at: [] process_one_work+0x175/0x3a5 CPU: 0 PID: 27 Comm: kworker/0:1 Not tainted 3.13.0-rc3-branch-dros_testing+ #21 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/31/2013 Workqueue: events nfs_direct_write_schedule_work [nfs] 0000000000000000 ffff88007a39bbb8 ffffffff81491256 ffff88007b87a130 ffff88007a39bbd8 ffffffff8105f103 ffff880079614000 ffff880079617d40 ffff88007a39bc20 ffffffffa011603e ffff880078988b98 0000000000000000 Call Trace: [] dump_stack+0x4d/0x66 [] __might_sleep+0x100/0x105 [] transfer_commit_list+0x94/0xf1 [nfs_layout_nfsv41_files] [] filelayout_recover_commit_reqs+0x3b/0x68 [nfs_layout_nfsv41_files] [] nfs_direct_write_reschedule+0x9f/0x1d6 [nfs] [] ? mark_lock+0x1df/0x224 [] ? trace_hardirqs_off_caller+0x37/0xa4 [] ? trace_hardirqs_off+0xd/0xf [] nfs_direct_write_schedule_work+0x9d/0xb7 [nfs] [] ? process_one_work+0x175/0x3a5 [] process_one_work+0x1f6/0x3a5 [] ? process_one_work+0x175/0x3a5 [] worker_thread+0x149/0x1f5 [] ? rescuer_thread+0x28d/0x28d [] kthread+0xd2/0xda [] ? __kthread_parkme+0x61/0x61 [] ret_from_fork+0x7c/0xb0 [] ? __kthread_parkme+0x61/0x61 Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 0a93e79..03fd8be 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -1216,17 +1216,17 @@ static void filelayout_recover_commit_reqs(struct list_head *dst, struct pnfs_commit_bucket *b; int i; - /* NOTE cinfo->lock is NOT held, relying on fact that this is - * only called on single thread per dreq. - * Can't take the lock because need to do pnfs_put_lseg - */ + spin_lock(cinfo->lock); for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { if (transfer_commit_list(&b->written, dst, cinfo, 0)) { + spin_unlock(cinfo->lock); pnfs_put_lseg(b->wlseg); b->wlseg = NULL; + spin_lock(cinfo->lock); } } cinfo->ds->nwritten = 0; + spin_unlock(cinfo->lock); } static unsigned int -- cgit v1.1 From ed7e5423014ad89720fcf315c0b73f2c5d0c7bd2 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 22 Jan 2014 20:34:54 +0200 Subject: pnfs: Proper delay for NFS4ERR_RECALLCONFLICT in layout_get_done An NFS4ERR_RECALLCONFLICT is returned by server from a GET_LAYOUT only when a Server Sent a RECALL do to that GET_LAYOUT, or the RECALL and GET_LAYOUT crossed on the wire. In any way this means we want to wait at most until in-flight IO is finished and the RECALL can be satisfied. So a proper wait here is more like 1/10 of a second, not 15 seconds like we have now. In case of a server bug we delay exponentially longer on each retry. Current code totally craps out performance of very large files on most pnfs-objects layouts, because of how the map changes when the file has grown into the next raid group. [Stable: This will patch back to 3.9. If there are earlier still maintained trees, please tell me I'll send a patch] CC: Stable Tree Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'fs/nfs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a9eaaaa..a196532 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7409,9 +7409,9 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) struct nfs_server *server = NFS_SERVER(inode); struct pnfs_layout_hdr *lo; struct nfs4_state *state = NULL; - unsigned long timeo, giveup; + unsigned long timeo, now, giveup; - dprintk("--> %s\n", __func__); + dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status); if (!nfs41_sequence_done(task, &lgp->res.seq_res)) goto out; @@ -7419,12 +7419,38 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case 0: goto out; + /* + * NFS4ERR_LAYOUTTRYLATER is a conflict with another client + * (or clients) writing to the same RAID stripe + */ case -NFS4ERR_LAYOUTTRYLATER: + /* + * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall + * existing layout before getting a new one). + */ case -NFS4ERR_RECALLCONFLICT: timeo = rpc_get_timeout(task->tk_client); giveup = lgp->args.timestamp + timeo; - if (time_after(giveup, jiffies)) - task->tk_status = -NFS4ERR_DELAY; + now = jiffies; + if (time_after(giveup, now)) { + unsigned long delay; + + /* Delay for: + * - Not less then NFS4_POLL_RETRY_MIN. + * - One last time a jiffie before we give up + * - exponential backoff (time_now minus start_attempt) + */ + delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN, + min((giveup - now - 1), + now - lgp->args.timestamp)); + + dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n", + __func__, delay); + rpc_delay(task, delay); + task->tk_status = 0; + rpc_restart_call_prepare(task); + goto out; /* Do not call nfs4_async_handle_error() */ + } break; case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: -- cgit v1.1