author     Linus Torvalds <torvalds@linux-foundation.org>   2015-04-26 17:33:59 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2015-04-26 17:33:59 -0700
commit     59953fba87e5e535657403cc6439d24187929559
tree       4f92cc3bcacf052cb3fb895512af5a7d3dad86cb
parent     9ec3a646fe09970f801ab15e0f1694060b9f19af
parent     f139b6c676c7e49b66016b28bf3f8ec5c54be891
Merge tag 'nfs-for-4.1-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
Pull NFS client updates from Trond Myklebust:
"Another set of mainly bugfixes and a couple of cleanups. No new
functionality in this round.
Highlights include:
Stable patches:
- Fix a regression in /proc/self/mountstats
- Fix the pNFS flexfiles O_DIRECT support (see the sketch after this list)
- Fix high load average due to callback thread sleeping
Bugfixes:
- Various patches to fix the pNFS layoutcommit support
- Do not cache pNFS deviceids unless server notifications are enabled
- Fix a SUNRPC transport reconnection regression
- Make debugfs file creation failure non-fatal in SUNRPC
- Another fix for circular directory warnings on NFSv4 "junctioned"
mountpoints
- Fix locking around NFSv4.2 fallocate() support
- Truncating NFSv4 file opens should also sync O_DIRECT writes
- Prevent infinite loop in rpcrdma_ep_create()
Features:
- Various improvements to the RDMA transport code's handling of
memory registration
- Various code cleanups"
* tag 'nfs-for-4.1-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (55 commits)
fs/nfs: fix new compiler warning about boolean in switch
nfs: Remove unneeded casts in nfs
NFS: Don't attempt to decode missing directory entries
Revert "nfs: replace nfs_add_stats with nfs_inc_stats when add one"
NFS: Rename idmap.c to nfs4idmap.c
NFS: Move nfs_idmap.h into fs/nfs/
NFS: Remove CONFIG_NFS_V4 checks from nfs_idmap.h
NFS: Add a stub for GETDEVICELIST
nfs: remove WARN_ON_ONCE from nfs_direct_good_bytes
nfs: fix DIO good bytes calculation
nfs: Fetch MOUNTED_ON_FILEID when updating an inode (see the sketch after the diffstat)
sunrpc: make debugfs file creation failure non-fatal
nfs: fix high load average due to callback thread sleeping
NFS: Reduce time spent holding the i_mutex during fallocate()
NFS: Don't zap caches on fallocate()
xprtrdma: Make rpcrdma_{un}map_one() into inline functions
xprtrdma: Handle non-SEND completions via a callout
xprtrdma: Add "open" memreg op
xprtrdma: Add "destroy MRs" memreg op
xprtrdma: Add "reset MRs" memreg op
...
49 files changed, 1150 insertions(+), 914 deletions(-)
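One hunk worth highlighting before the diff itself: "nfs: Fetch MOUNTED_ON_FILEID when updating an inode" adds a nfs_fileid_valid() helper so that nfs_update_inode() only reports "fileid changed" when the cached fileid matches neither the fileid nor the mounted_on_fileid attribute, the case that previously produced spurious errors on "junctioned" mountpoints where the two fileids legitimately differ. A standalone C sketch of that check follows (the types, constants, and the driver in main() are hypothetical; the ret1/ret2 logic mirrors the fs/nfs/inode.c hunk below).

/* Sketch of the relaxed fileid check (hypothetical types and bitmask
 * constants; not the kernel code). The cached fileid is accepted if it
 * matches at least one of the fileids the server reported.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ATTR_FILEID            0x1
#define ATTR_MOUNTED_ON_FILEID 0x2

struct fattr {
	unsigned valid;			/* which attributes are present */
	uint64_t fileid;
	uint64_t mounted_on_fileid;
};

static bool fileid_valid(uint64_t cached, const struct fattr *fattr)
{
	bool ret1 = true, ret2 = true;

	if (fattr->valid & ATTR_FILEID)
		ret1 = (cached == fattr->fileid);
	if (fattr->valid & ATTR_MOUNTED_ON_FILEID)
		ret2 = (cached == fattr->mounted_on_fileid);
	return ret1 || ret2;
}

int main(void)
{
	/* a crossing point: the two fileids differ by design */
	struct fattr crossing = {
		.valid = ATTR_FILEID | ATTR_MOUNTED_ON_FILEID,
		.fileid = 42,		/* fileid on the child filesystem */
		.mounted_on_fileid = 7,	/* fileid of the covered directory */
	};

	printf("matches child fileid:   %d\n", fileid_valid(42, &crossing));
	printf("matches covered fileid: %d\n", fileid_valid(7, &crossing));
	printf("matches neither:        %d\n", fileid_valid(9, &crossing));
	return 0;
}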
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 1e987ac..8664417 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -22,7 +22,7 @@ nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o obj-$(CONFIG_NFS_V4) += nfsv4.o CFLAGS_nfs4trace.o += -I$(src) nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \ - delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \ + delegation.o nfs4idmap.o callback.o callback_xdr.o callback_proc.o \ nfs4namespace.o nfs4getroot.o nfs4client.o nfs4session.o \ dns_resolve.o nfs4trace.o nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 1cac3c1..d2554fe 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -890,6 +890,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = { .free_deviceid_node = bl_free_deviceid_node, .pg_read_ops = &bl_pg_read_ops, .pg_write_ops = &bl_pg_write_ops, + .sync = pnfs_generic_sync, }; static int __init nfs4blocklayout_init(void) diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 5aed4f9..e535599 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -33,7 +33,7 @@ bl_free_deviceid_node(struct nfs4_deviceid_node *d) container_of(d, struct pnfs_block_dev, node); bl_free_device(dev); - kfree(dev); + kfree_rcu(dev, node.rcu); } static int diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 351be920..8d129bb 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -128,7 +128,7 @@ nfs41_callback_svc(void *vrqstp) if (try_to_freeze()) continue; - prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_UNINTERRUPTIBLE); + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); if (!list_empty(&serv->sv_cb_list)) { req = list_first_entry(&serv->sv_cb_list, @@ -142,10 +142,10 @@ nfs41_callback_svc(void *vrqstp) error); } else { spin_unlock_bh(&serv->sv_cb_lock); - /* schedule_timeout to game the hung task watchdog */ - schedule_timeout(60 * HZ); + schedule(); finish_wait(&serv->sv_cb_waitq, &wq); } + flush_signals(current); } return 0; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 1987415..892aeff 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -31,7 +31,6 @@ #include <linux/lockd/bind.h> #include <linux/seq_file.h> #include <linux/mount.h> -#include <linux/nfs_idmap.h> #include <linux/vfs.h> #include <linux/inet.h> #include <linux/in6.h> diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index a6ad688..029d688 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -378,7 +378,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct if (freeme == NULL) goto out; } - list_add_rcu(&delegation->super_list, &server->delegations); + list_add_tail_rcu(&delegation->super_list, &server->delegations); rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; @@ -514,7 +514,7 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode) delegation = nfs_inode_detach_delegation(inode); if (delegation != NULL) - nfs_do_return_delegation(inode, delegation, 0); + nfs_do_return_delegation(inode, delegation, 1); } /** diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 1e51ecd..b2c8b31 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -543,6 +543,9 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en if (scratch == NULL) return -ENOMEM; + if (buflen == 0) + goto out_nopages; + xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); 
xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); @@ -564,6 +567,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en break; } while (!entry->eof); +out_nopages: if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { array = nfs_readdir_get_array(page); if (!IS_ERR(array)) { diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index b2cbc3a..38678d9 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -129,22 +129,25 @@ nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) int i; ssize_t count; - WARN_ON_ONCE(hdr->pgio_mirror_idx >= dreq->mirror_count); - - count = dreq->mirrors[hdr->pgio_mirror_idx].count; - if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) { - count = hdr->io_start + hdr->good_bytes - dreq->io_start; - dreq->mirrors[hdr->pgio_mirror_idx].count = count; - } - - /* update the dreq->count by finding the minimum agreed count from all - * mirrors */ - count = dreq->mirrors[0].count; + if (dreq->mirror_count == 1) { + dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes; + dreq->count += hdr->good_bytes; + } else { + /* mirrored writes */ + count = dreq->mirrors[hdr->pgio_mirror_idx].count; + if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) { + count = hdr->io_start + hdr->good_bytes - dreq->io_start; + dreq->mirrors[hdr->pgio_mirror_idx].count = count; + } + /* update the dreq->count by finding the minimum agreed count from all + * mirrors */ + count = dreq->mirrors[0].count; - for (i = 1; i < dreq->mirror_count; i++) - count = min(count, dreq->mirrors[i].count); + for (i = 1; i < dreq->mirror_count; i++) + count = min(count, dreq->mirrors[i].count); - dreq->count = count; + dreq->count = count; + } } /* @@ -258,18 +261,11 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) if (!IS_SWAPFILE(inode)) return 0; -#ifndef CONFIG_NFS_SWAP - dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n", - iocb->ki_filp, (long long) pos, iter->nr_segs); - - return -EINVAL; -#else VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); if (iov_iter_rw(iter) == READ) return nfs_file_direct_read(iocb, iter, pos); return nfs_file_direct_write(iocb, iter); -#endif /* CONFIG_NFS_SWAP */ } static void nfs_direct_release_pages(struct page **pages, unsigned int npages) @@ -1030,6 +1026,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) if (i_size_read(inode) < iocb->ki_pos) i_size_write(inode, iocb->ki_pos); spin_unlock(&inode->i_lock); + generic_write_sync(file, pos, result); } } nfs_direct_req_release(dreq); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index c40e436..8b8d83a 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -280,6 +280,7 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) trace_nfs_fsync_enter(inode); + nfs_inode_dio_wait(inode); do { ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) @@ -782,7 +783,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) * Flush all pending writes before doing anything * with locks.. 
*/ - nfs_sync_mapping(filp->f_mapping); + vfs_fsync(filp, 0); l_ctx = nfs_get_lock_context(nfs_file_open_context(filp)); if (!IS_ERR(l_ctx)) { diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 91e88a7..a46bf6d 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -258,7 +258,8 @@ filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) hdr->res.verf->committed != NFS_DATA_SYNC) return; - pnfs_set_layoutcommit(hdr); + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + hdr->res.count); dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } @@ -373,7 +374,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, } if (data->verf.committed == NFS_UNSTABLE) - pnfs_commit_set_layoutcommit(data); + pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } @@ -1086,7 +1087,7 @@ filelayout_alloc_deviceid_node(struct nfs_server *server, } static void -filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) +filelayout_free_deviceid_node(struct nfs4_deviceid_node *d) { nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node)); } @@ -1137,7 +1138,8 @@ static struct pnfs_layoutdriver_type filelayout_type = { .read_pagelist = filelayout_read_pagelist, .write_pagelist = filelayout_write_pagelist, .alloc_deviceid_node = filelayout_alloc_deviceid_node, - .free_deviceid_node = filelayout_free_deveiceid_node, + .free_deviceid_node = filelayout_free_deviceid_node, + .sync = pnfs_nfs_generic_sync, }; static int __init nfs4filelayout_init(void) diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 4f372e2..4946ef4 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -55,7 +55,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) nfs4_pnfs_ds_put(ds); } kfree(dsaddr->stripe_indices); - kfree(dsaddr); + kfree_rcu(dsaddr, id_node.rcu); } /* Decode opaque device data and return the result */ diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 315cc68..7d05089 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -11,10 +11,10 @@ #include <linux/module.h> #include <linux/sunrpc/metrics.h> -#include <linux/nfs_idmap.h> #include "flexfilelayout.h" #include "../nfs4session.h" +#include "../nfs4idmap.h" #include "../internal.h" #include "../delegation.h" #include "../nfs4trace.h" @@ -891,7 +891,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task, static void ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) { - pnfs_set_layoutcommit(hdr); + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + hdr->res.count); dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } @@ -1074,7 +1075,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, } if (data->verf.committed == NFS_UNSTABLE) - pnfs_commit_set_layoutcommit(data); + pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } @@ -1414,7 +1415,7 @@ ff_layout_get_ds_info(struct inode *inode) } static void -ff_layout_free_deveiceid_node(struct nfs4_deviceid_node *d) +ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d) { nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds, id_node)); @@ -1498,7 +1499,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { 
.pg_read_ops = &ff_layout_pg_read_ops, .pg_write_ops = &ff_layout_pg_write_ops, .get_ds_info = ff_layout_get_ds_info, - .free_deviceid_node = ff_layout_free_deveiceid_node, + .free_deviceid_node = ff_layout_free_deviceid_node, .mark_request_commit = pnfs_layout_mark_request_commit, .clear_request_commit = pnfs_generic_clear_request_commit, .scan_commit_lists = pnfs_generic_scan_commit_lists, @@ -1508,6 +1509,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .write_pagelist = ff_layout_write_pagelist, .alloc_deviceid_node = ff_layout_alloc_deviceid_node, .encode_layoutreturn = ff_layout_encode_layoutreturn, + .sync = pnfs_nfs_generic_sync, }; static int __init nfs4flexfilelayout_init(void) diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index e2c01f2..77a2d02 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -30,7 +30,7 @@ void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds) { nfs4_print_deviceid(&mirror_ds->id_node.deviceid); nfs4_pnfs_ds_put(mirror_ds->ds); - kfree(mirror_ds); + kfree_rcu(mirror_ds, id_node.rcu); } /* Decode opaque device data and construct new_ds using it */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 3689e95..f734562 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -133,6 +133,13 @@ void nfs_evict_inode(struct inode *inode) nfs_clear_inode(inode); } +int nfs_sync_inode(struct inode *inode) +{ + nfs_inode_dio_wait(inode); + return nfs_wb_all(inode); +} +EXPORT_SYMBOL_GPL(nfs_sync_inode); + /** * nfs_sync_mapping - helper to flush all mmapped dirty data to disk */ @@ -192,7 +199,6 @@ void nfs_zap_caches(struct inode *inode) nfs_zap_caches_locked(inode); spin_unlock(&inode->i_lock); } -EXPORT_SYMBOL_GPL(nfs_zap_caches); void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) { @@ -525,10 +531,8 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) trace_nfs_setattr_enter(inode); /* Write all dirty data */ - if (S_ISREG(inode->i_mode)) { - nfs_inode_dio_wait(inode); - nfs_wb_all(inode); - } + if (S_ISREG(inode->i_mode)) + nfs_sync_inode(inode); fattr = nfs_alloc_fattr(); if (fattr == NULL) @@ -644,8 +648,9 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) trace_nfs_getattr_enter(inode); /* Flush out writes to the server in order to update c/mtime. */ if (S_ISREG(inode->i_mode)) { - nfs_inode_dio_wait(inode); - err = filemap_write_and_wait(inode->i_mapping); + mutex_lock(&inode->i_mutex); + err = nfs_sync_inode(inode); + mutex_unlock(&inode->i_mutex); if (err) goto out; } @@ -1588,6 +1593,19 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa } EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc); + +static inline bool nfs_fileid_valid(struct nfs_inode *nfsi, + struct nfs_fattr *fattr) +{ + bool ret1 = true, ret2 = true; + + if (fattr->valid & NFS_ATTR_FATTR_FILEID) + ret1 = (nfsi->fileid == fattr->fileid); + if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) + ret2 = (nfsi->fileid == fattr->mounted_on_fileid); + return ret1 || ret2; +} + /* * Many nfs protocol calls return the new file attributes after * an operation. 
Here we update the inode to reflect the state @@ -1614,7 +1632,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_display_fhandle_hash(NFS_FH(inode)), atomic_read(&inode->i_count), fattr->valid); - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) { + if (!nfs_fileid_valid(nfsi, fattr)) { printk(KERN_ERR "NFS: server %s error: fileid changed\n" "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", NFS_SERVER(inode)->nfs_client->cl_hostname, @@ -1819,7 +1837,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) struct inode *nfs_alloc_inode(struct super_block *sb) { struct nfs_inode *nfsi; - nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL); + nfsi = kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL); if (!nfsi) return NULL; nfsi->flags = 0UL; diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index cb17072..3a9e752 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -36,13 +36,16 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, loff_t offset, loff_t len) { struct inode *inode = file_inode(filep); + struct nfs_server *server = NFS_SERVER(inode); struct nfs42_falloc_args args = { .falloc_fh = NFS_FH(inode), .falloc_offset = offset, .falloc_length = len, + .falloc_bitmask = server->cache_consistency_bitmask, + }; + struct nfs42_falloc_res res = { + .falloc_server = server, }; - struct nfs42_falloc_res res; - struct nfs_server *server = NFS_SERVER(inode); int status; msg->rpc_argp = &args; @@ -52,8 +55,17 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, if (status) return status; - return nfs4_call_sync(server->client, server, msg, - &args.seq_args, &res.seq_res, 0); + res.falloc_fattr = nfs_alloc_fattr(); + if (!res.falloc_fattr) + return -ENOMEM; + + status = nfs4_call_sync(server->client, server, msg, + &args.seq_args, &res.seq_res, 0); + if (status == 0) + status = nfs_post_op_update_inode(inode, res.falloc_fattr); + + kfree(res.falloc_fattr); + return status; } static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, @@ -84,9 +96,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) return -EOPNOTSUPP; + mutex_lock(&inode->i_mutex); + err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE; + + mutex_unlock(&inode->i_mutex); return err; } @@ -101,9 +117,16 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE)) return -EOPNOTSUPP; + nfs_wb_all(inode); + mutex_lock(&inode->i_mutex); + err = nfs42_proc_fallocate(&msg, filep, offset, len); + if (err == 0) + truncate_pagecache_range(inode, offset, (offset + len) -1); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; + + mutex_unlock(&inode->i_mutex); return err; } diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 038a7e15..1a25b27 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -25,16 +25,20 @@ #define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - encode_allocate_maxsz) + encode_allocate_maxsz + \ + encode_getattr_maxsz) #define NFS4_dec_allocate_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - decode_allocate_maxsz) + decode_allocate_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - 
encode_deallocate_maxsz) + encode_deallocate_maxsz + \ + encode_getattr_maxsz) #define NFS4_dec_deallocate_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - decode_deallocate_maxsz) + decode_deallocate_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_seek_maxsz) @@ -92,6 +96,7 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->falloc_fh, &hdr); encode_allocate(xdr, args, &hdr); + encode_getfattr(xdr, args->falloc_bitmask, &hdr); encode_nops(&hdr); } @@ -110,6 +115,7 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->falloc_fh, &hdr); encode_deallocate(xdr, args, &hdr); + encode_getfattr(xdr, args->falloc_bitmask, &hdr); encode_nops(&hdr); } @@ -183,6 +189,9 @@ static int nfs4_xdr_dec_allocate(struct rpc_rqst *rqstp, if (status) goto out; status = decode_allocate(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, res->falloc_fattr, res->falloc_server); out: return status; } @@ -207,6 +216,9 @@ static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp, if (status) goto out; status = decode_deallocate(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, res->falloc_fattr, res->falloc_server); out: return status; } diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 51c2dbd..e42be52 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -4,7 +4,6 @@ */ #include <linux/module.h> #include <linux/nfs_fs.h> -#include <linux/nfs_idmap.h> #include <linux/nfs_mount.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/auth.h> @@ -15,6 +14,7 @@ #include "callback.h" #include "delegation.h" #include "nfs4session.h" +#include "nfs4idmap.h" #include "pnfs.h" #include "netns.h" diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 619eca3..f58c17b 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -10,6 +10,8 @@ #include "fscache.h" #include "pnfs.h" +#include "nfstrace.h" + #ifdef CONFIG_NFS_V4_2 #include "nfs42.h" #endif @@ -57,7 +59,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (openflags & O_TRUNC) { attr.ia_valid |= ATTR_SIZE; attr.ia_size = 0; - nfs_wb_all(inode); + nfs_sync_inode(inode); } inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened); @@ -100,6 +102,9 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) int ret; struct inode *inode = file_inode(file); + trace_nfs_fsync_enter(inode); + + nfs_inode_dio_wait(inode); do { ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) @@ -107,7 +112,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) mutex_lock(&inode->i_mutex); ret = nfs_file_fsync_commit(file, start, end, datasync); if (!ret) - ret = pnfs_layoutcommit_inode(inode, true); + ret = pnfs_sync_inode(inode, !!datasync); mutex_unlock(&inode->i_mutex); /* * If nfs_file_fsync_commit detected a server reboot, then @@ -118,6 +123,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) end = LLONG_MAX; } while (ret == -EAGAIN); + trace_nfs_fsync_exit(inode, ret); return ret; } @@ -152,15 +158,9 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t if (ret < 0) return ret; - mutex_lock(&inode->i_mutex); if (mode & FALLOC_FL_PUNCH_HOLE) - ret = nfs42_proc_deallocate(filep, offset, len); - else - ret = nfs42_proc_allocate(filep, offset, len); - 
mutex_unlock(&inode->i_mutex); - - nfs_zap_caches(inode); - return ret; + return nfs42_proc_deallocate(filep, offset, len); + return nfs42_proc_allocate(filep, offset, len); } #endif /* CONFIG_NFS_V4_2 */ diff --git a/fs/nfs/idmap.c b/fs/nfs/nfs4idmap.c index 857e2a9..2e1737c 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -36,7 +36,6 @@ #include <linux/types.h> #include <linux/parser.h> #include <linux/fs.h> -#include <linux/nfs_idmap.h> #include <net/net_namespace.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/nfs_fs.h> @@ -49,6 +48,7 @@ #include "internal.h" #include "netns.h" +#include "nfs4idmap.h" #include "nfs4trace.h" #define NFS_UINT_MAXLEN 11 diff --git a/include/linux/nfs_idmap.h b/fs/nfs/nfs4idmap.h index 333844e..de44d73 100644 --- a/include/linux/nfs_idmap.h +++ b/fs/nfs/nfs4idmap.h @@ -1,5 +1,5 @@ /* - * include/linux/nfs_idmap.h + * fs/nfs/nfs4idmap.h * * UID and GID to name mapping for clients. * @@ -46,19 +46,8 @@ struct nfs_server; struct nfs_fattr; struct nfs4_string; -#if IS_ENABLED(CONFIG_NFS_V4) int nfs_idmap_init(void); void nfs_idmap_quit(void); -#else -static inline int nfs_idmap_init(void) -{ - return 0; -} - -static inline void nfs_idmap_quit(void) -{} -#endif - int nfs_idmap_new(struct nfs_client *); void nfs_idmap_delete(struct nfs_client *); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 98e533f..45b35b9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -51,7 +51,6 @@ #include <linux/namei.h> #include <linux/mount.h> #include <linux/module.h> -#include <linux/nfs_idmap.h> #include <linux/xattr.h> #include <linux/utsname.h> #include <linux/freezer.h> @@ -63,6 +62,7 @@ #include "callback.h" #include "pnfs.h" #include "netns.h" +#include "nfs4idmap.h" #include "nfs4session.h" #include "fscache.h" @@ -185,7 +185,8 @@ const u32 nfs4_fattr_bitmap[3] = { | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_METADATA - | FATTR4_WORD1_TIME_MODIFY, + | FATTR4_WORD1_TIME_MODIFY + | FATTR4_WORD1_MOUNTED_ON_FILEID, #ifdef CONFIG_NFS_V4_SECURITY_LABEL FATTR4_WORD2_SECURITY_LABEL #endif @@ -3095,16 +3096,13 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info, bool auth_probe) { - int status; + int status = 0; - switch (auth_probe) { - case false: + if (!auth_probe) status = nfs4_lookup_root(server, fhandle, info); - if (status != -NFS4ERR_WRONGSEC) - break; - default: + + if (auth_probe || status == NFS4ERR_WRONGSEC) status = nfs4_do_find_root_sec(server, fhandle, info); - } if (status == 0) status = nfs4_server_capabilities(server, fhandle); @@ -7944,6 +7942,8 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, { struct nfs4_getdeviceinfo_args args = { .pdev = pdev, + .notify_types = NOTIFY_DEVICEID4_CHANGE | + NOTIFY_DEVICEID4_DELETE, }; struct nfs4_getdeviceinfo_res res = { .pdev = pdev, @@ -7958,6 +7958,11 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, dprintk("--> %s\n", __func__); status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); + if (res.notification & ~args.notify_types) + dprintk("%s: unsupported notification\n", __func__); + if (res.notification != args.notify_types) + pdev->nocache = 1; + dprintk("<-- %s status=%d\n", __func__, status); return status; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 3b2b205..2782cfc 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -42,7 +42,6 @@ #include <linux/slab.h> #include <linux/fs.h> #include <linux/nfs_fs.h> -#include <linux/nfs_idmap.h> #include 
<linux/kthread.h> #include <linux/module.h> #include <linux/random.h> @@ -57,6 +56,7 @@ #include "callback.h" #include "delegation.h" #include "internal.h" +#include "nfs4idmap.h" #include "nfs4session.h" #include "pnfs.h" #include "netns.h" diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 75090fe..6fb7cb6 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -3,12 +3,12 @@ */ #include <linux/init.h> #include <linux/module.h> -#include <linux/nfs_idmap.h> #include <linux/nfs4_mount.h> #include <linux/nfs_fs.h> #include "delegation.h" #include "internal.h" #include "nfs4_fs.h" +#include "nfs4idmap.h" #include "dns_resolve.h" #include "pnfs.h" #include "nfs.h" @@ -91,10 +91,11 @@ static void nfs4_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); - pnfs_return_layout(inode); - pnfs_destroy_layout(NFS_I(inode)); /* If we are holding a delegation, return it! */ nfs_inode_return_delegation_noreclaim(inode); + /* Note that above delegreturn would trigger pnfs return-on-close */ + pnfs_return_layout(inode); + pnfs_destroy_layout(NFS_I(inode)); /* First call standard NFS clear_inode() code */ nfs_clear_inode(inode); } diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index b6ebe7e..0fbd3ab 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -6,10 +6,10 @@ * Copyright (c) 2006 Trond Myklebust <Trond.Myklebust@netapp.com> */ #include <linux/sysctl.h> -#include <linux/nfs_idmap.h> #include <linux/nfs_fs.h> #include "nfs4_fs.h" +#include "nfs4idmap.h" #include "callback.h" static const int nfs_set_port_min = 0; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 5c399ec..0aea978 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -52,10 +52,10 @@ #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> -#include <linux/nfs_idmap.h> #include "nfs4_fs.h" #include "internal.h" +#include "nfs4idmap.h" #include "nfs4session.h" #include "pnfs.h" #include "netns.h" @@ -1920,7 +1920,7 @@ encode_getdeviceinfo(struct xdr_stream *xdr, p = reserve_space(xdr, 4 + 4); *p++ = cpu_to_be32(1); /* bitmap length */ - *p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE); + *p++ = cpu_to_be32(args->notify_types); } static void @@ -5753,8 +5753,9 @@ out_overflow: #if defined(CONFIG_NFS_V4_1) static int decode_getdeviceinfo(struct xdr_stream *xdr, - struct pnfs_device *pdev) + struct nfs4_getdeviceinfo_res *res) { + struct pnfs_device *pdev = res->pdev; __be32 *p; uint32_t len, type; int status; @@ -5802,12 +5803,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, if (unlikely(!p)) goto out_overflow; - if (be32_to_cpup(p++) & - ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) { - dprintk("%s: unsupported notification\n", - __func__); - } - + res->notification = be32_to_cpup(p++); for (i = 1; i < len; i++) { if (be32_to_cpup(p++)) { dprintk("%s: unsupported notification\n", @@ -7061,7 +7057,7 @@ static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, status = decode_sequence(xdr, &res->seq_res, rqstp); if (status != 0) goto out; - status = decode_getdeviceinfo(xdr, res->pdev); + status = decode_getdeviceinfo(xdr, res); out: return status; } @@ -7365,6 +7361,11 @@ nfs4_stat_to_errno(int stat) .p_name = #proc, \ } +#define STUB(proc) \ +[NFSPROC4_CLNT_##proc] = { \ + .p_name = #proc, \ +} + struct rpc_procinfo nfs4_procedures[] = { PROC(READ, enc_read, dec_read), PROC(WRITE, enc_write, dec_write), @@ -7417,6 +7418,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(SECINFO_NO_NAME, 
enc_secinfo_no_name, dec_secinfo_no_name), PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), + STUB(GETDEVICELIST), PROC(BIND_CONN_TO_SESSION, enc_bind_conn_to_session, dec_bind_conn_to_session), PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid), diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c index 4eb0aea..c74f7af 100644 --- a/fs/nfs/nfstrace.c +++ b/fs/nfs/nfstrace.c @@ -7,3 +7,6 @@ #define CREATE_TRACE_POINTS #include "nfstrace.h" + +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter); +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit); diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 24e1d74..5aaed36 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -57,7 +57,7 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d) dprintk("%s: free od=%p\n", __func__, de->od.od); osduld_put_device(de->od.od); - kfree(de); + kfree_rcu(d, rcu); } struct objio_segment { @@ -637,6 +637,8 @@ static struct pnfs_layoutdriver_type objlayout_type = { .pg_read_ops = &objio_pg_read_ops, .pg_write_ops = &objio_pg_write_ops, + .sync = pnfs_generic_sync, + .free_deviceid_node = objio_free_deviceid_node, .encode_layoutcommit = objlayout_encode_layoutcommit, diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 4f802b0..2306062 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1090,6 +1090,7 @@ bool pnfs_roc(struct inode *ino) pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); + pnfs_layoutcommit_inode(ino, true); return true; out_noroc: @@ -1104,8 +1105,10 @@ out_noroc: } } spin_unlock(&ino->i_lock); - if (layoutreturn) + if (layoutreturn) { + pnfs_layoutcommit_inode(ino, true); pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + } return false; } @@ -1841,7 +1844,8 @@ void pnfs_ld_write_done(struct nfs_pgio_header *hdr) { trace_nfs4_pnfs_write(hdr, hdr->pnfs_error); if (!hdr->pnfs_error) { - pnfs_set_layoutcommit(hdr); + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + hdr->res.count); hdr->mds_ops->rpc_call_done(&hdr->task, hdr); } else pnfs_ld_handle_write_error(hdr); @@ -1902,7 +1906,6 @@ static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) pnfs_put_lseg(hdr->lseg); nfs_pgio_header_free(hdr); } -EXPORT_SYMBOL_GPL(pnfs_writehdr_free); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) @@ -2032,7 +2035,6 @@ static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) pnfs_put_lseg(hdr->lseg); nfs_pgio_header_free(hdr); } -EXPORT_SYMBOL_GPL(pnfs_readhdr_free); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) @@ -2099,64 +2101,34 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); void -pnfs_set_layoutcommit(struct nfs_pgio_header *hdr) +pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg, + loff_t end_pos) { - struct inode *inode = hdr->inode; struct nfs_inode *nfsi = NFS_I(inode); - loff_t end_pos = hdr->mds_offset + hdr->res.count; bool mark_as_dirty = false; spin_lock(&inode->i_lock); if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { - mark_as_dirty = true; - dprintk("%s: Set layoutcommit for inode %lu ", - __func__, inode->i_ino); - } - if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) { - /* references matched in nfs4_layoutcommit_release */ - pnfs_get_lseg(hdr->lseg); - } - if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; - 
spin_unlock(&inode->i_lock); - dprintk("%s: lseg %p end_pos %llu\n", - __func__, hdr->lseg, nfsi->layout->plh_lwb); - - /* if pnfs_layoutcommit_inode() runs between inode locks, the next one - * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ - if (mark_as_dirty) - mark_inode_dirty_sync(inode); -} -EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); - -void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data) -{ - struct inode *inode = data->inode; - struct nfs_inode *nfsi = NFS_I(inode); - bool mark_as_dirty = false; - - spin_lock(&inode->i_lock); - if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { mark_as_dirty = true; dprintk("%s: Set layoutcommit for inode %lu ", __func__, inode->i_ino); - } - if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) { + } else if (end_pos > nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = end_pos; + if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) { /* references matched in nfs4_layoutcommit_release */ - pnfs_get_lseg(data->lseg); + pnfs_get_lseg(lseg); } - if (data->lwb > nfsi->layout->plh_lwb) - nfsi->layout->plh_lwb = data->lwb; spin_unlock(&inode->i_lock); dprintk("%s: lseg %p end_pos %llu\n", - __func__, data->lseg, nfsi->layout->plh_lwb); + __func__, lseg, nfsi->layout->plh_lwb); /* if pnfs_layoutcommit_inode() runs between inode locks, the next one * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ if (mark_as_dirty) mark_inode_dirty_sync(inode); } -EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit); +EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) { @@ -2216,7 +2188,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) pnfs_list_write_lseg(inode, &data->lseg_list); end_pos = nfsi->layout->plh_lwb; - nfsi->layout->plh_lwb = 0; nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid); spin_unlock(&inode->i_lock); @@ -2233,11 +2204,11 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) status = ld->prepare_layoutcommit(&data->args); if (status) { spin_lock(&inode->i_lock); - if (end_pos < nfsi->layout->plh_lwb) + set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); + if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; spin_unlock(&inode->i_lock); put_rpccred(data->cred); - set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); goto clear_layoutcommitting; } } @@ -2258,6 +2229,13 @@ clear_layoutcommitting: } EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode); +int +pnfs_generic_sync(struct inode *inode, bool datasync) +{ + return pnfs_layoutcommit_inode(inode, true); +} +EXPORT_SYMBOL_GPL(pnfs_generic_sync); + struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) { struct nfs4_threshold *thp; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 084c914..1e6308f 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -155,6 +155,8 @@ struct pnfs_layoutdriver_type { int how, struct nfs_commit_info *cinfo); + int (*sync)(struct inode *inode, bool datasync); + /* * Return PNFS_ATTEMPTED to indicate the layout code has attempted * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS @@ -203,6 +205,7 @@ struct pnfs_device { struct page **pages; unsigned int pgbase; unsigned int pglen; /* reply buffer length */ + unsigned char nocache : 1;/* May not be cached */ }; #define NFS4_PNFS_GETDEVLIST_MAXNUM 16 @@ -263,10 +266,11 @@ bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct 
rpc_task *task); -void pnfs_set_layoutcommit(struct nfs_pgio_header *); -void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data); +void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); +int pnfs_generic_sync(struct inode *inode, bool datasync); +int pnfs_nfs_generic_sync(struct inode *inode, bool datasync); int _pnfs_return_layout(struct inode *); int pnfs_commit_and_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_pgio_header *); @@ -291,6 +295,7 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, enum { NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */ NFS_DEVICEID_UNAVAILABLE, /* device temporarily unavailable */ + NFS_DEVICEID_NOCACHE, /* device may not be cached */ }; /* pnfs_dev.c */ @@ -302,6 +307,7 @@ struct nfs4_deviceid_node { unsigned long flags; unsigned long timestamp_unavailable; struct nfs4_deviceid deviceid; + struct rcu_head rcu; atomic_t ref; }; @@ -486,6 +492,14 @@ pnfs_ld_read_whole_page(struct inode *inode) return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE; } +static inline int +pnfs_sync_inode(struct inode *inode, bool datasync) +{ + if (!pnfs_enabled_sb(NFS_SERVER(inode))) + return 0; + return NFS_SERVER(inode)->pnfs_curr_ld->sync(inode, datasync); +} + static inline bool pnfs_layoutcommit_outstanding(struct inode *inode) { @@ -568,6 +582,12 @@ pnfs_ld_read_whole_page(struct inode *inode) return false; } +static inline int +pnfs_sync_inode(struct inode *inode, bool datasync) +{ + return 0; +} + static inline bool pnfs_roc(struct inode *ino) { diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index aa2ec00..2961fcd 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -149,6 +149,8 @@ nfs4_get_device_info(struct nfs_server *server, */ d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev, gfp_flags); + if (d && pdev->nocache) + set_bit(NFS_DEVICEID_NOCACHE, &d->flags); out_free_pages: for (i = 0; i < max_pages; i++) @@ -175,8 +177,8 @@ __nfs4_find_get_deviceid(struct nfs_server *server, rcu_read_lock(); d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id, hash); - if (d != NULL) - atomic_inc(&d->ref); + if (d != NULL && !atomic_inc_not_zero(&d->ref)) + d = NULL; rcu_read_unlock(); return d; } @@ -235,12 +237,11 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, return; } hlist_del_init_rcu(&d->node); + clear_bit(NFS_DEVICEID_NOCACHE, &d->flags); spin_unlock(&nfs4_deviceid_lock); - synchronize_rcu(); /* balance the initial ref set in pnfs_insert_deviceid */ - if (atomic_dec_and_test(&d->ref)) - d->ld->free_deviceid_node(d); + nfs4_put_deviceid_node(d); } EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); @@ -271,6 +272,11 @@ EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node); bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) { + if (test_bit(NFS_DEVICEID_NOCACHE, &d->flags)) { + if (atomic_add_unless(&d->ref, -1, 2)) + return false; + nfs4_delete_deviceid(d->ld, d->nfs_client, &d->deviceid); + } if (!atomic_dec_and_test(&d->ref)) return false; d->ld->free_deviceid_node(d); @@ -314,6 +320,7 @@ _deviceid_purge_client(const struct nfs_client *clp, long hash) if (d->nfs_client == clp && atomic_read(&d->ref)) { hlist_del_init_rcu(&d->node); hlist_add_head(&d->tmpnode, &tmp); + clear_bit(NFS_DEVICEID_NOCACHE, &d->flags); } rcu_read_unlock(); spin_unlock(&nfs4_deviceid_lock); @@ -321,12 +328,10 @@ _deviceid_purge_client(const 
struct nfs_client *clp, long hash) if (hlist_empty(&tmp)) return; - synchronize_rcu(); while (!hlist_empty(&tmp)) { d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode); hlist_del(&d->tmpnode); - if (atomic_dec_and_test(&d->ref)) - d->ld->free_deviceid_node(d); + nfs4_put_deviceid_node(d); } } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 54e36b3..f37e25b 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -561,7 +561,7 @@ static bool load_v3_ds_connect(void) return(get_v3_ds_connect != NULL); } -void __exit nfs4_pnfs_v3_ds_connect_unload(void) +void nfs4_pnfs_v3_ds_connect_unload(void) { if (get_v3_ds_connect) { symbol_put(nfs3_set_ds_client); @@ -868,3 +868,13 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, nfs_request_add_commit_list(req, list, cinfo); } EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); + +int +pnfs_nfs_generic_sync(struct inode *inode, bool datasync) +{ + if (datasync) + return 0; + return pnfs_layoutcommit_inode(inode, true); +} +EXPORT_SYMBOL_GPL(pnfs_nfs_generic_sync); + diff --git a/fs/nfs/read.c b/fs/nfs/read.c index a5b7427..ae0ff7a 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -284,7 +284,7 @@ int nfs_readpage(struct file *file, struct page *page) dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", page, PAGE_CACHE_SIZE, page_file_index(page)); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); - nfs_inc_stats(inode, NFSIOS_READPAGES); + nfs_add_stats(inode, NFSIOS_READPAGES, 1); /* * Try to flush any pending writes to the file.. diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 21f8f52..f175b83 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -43,7 +43,6 @@ #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/namei.h> -#include <linux/nfs_idmap.h> #include <linux/vfs.h> #include <linux/inet.h> #include <linux/in6.h> @@ -2193,7 +2192,7 @@ nfs_compare_remount_data(struct nfs_server *nfss, data->version != nfss->nfs_client->rpc_ops->version || data->minorversion != nfss->nfs_client->cl_minorversion || data->retrans != nfss->client->cl_timeout->to_retries || - data->selected_flavor != nfss->client->cl_auth->au_flavor || + !nfs_auth_info_match(&data->auth_info, nfss->client->cl_auth->au_flavor) || data->acregmin != nfss->acregmin / HZ || data->acregmax != nfss->acregmax / HZ || data->acdirmin != nfss->acdirmin / HZ || @@ -2241,7 +2240,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->wsize = nfss->wsize; data->retrans = nfss->client->cl_timeout->to_retries; data->selected_flavor = nfss->client->cl_auth->au_flavor; - data->auth_info = nfss->auth_info; data->acregmin = nfss->acregmin / HZ; data->acregmax = nfss->acregmax / HZ; data->acdirmin = nfss->acdirmin / HZ; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3612b46..d12a4be 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -580,7 +580,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st int ret; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); - nfs_inc_stats(inode, NFSIOS_WRITEPAGES); + nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); nfs_pageio_cond_complete(pgio, page_file_index(page)); ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); @@ -1840,17 +1840,16 @@ EXPORT_SYMBOL_GPL(nfs_write_inode); */ int nfs_wb_all(struct inode *inode) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_start = 0, - .range_end = LLONG_MAX, - }; int ret; trace_nfs_writeback_inode_enter(inode); - ret = sync_inode(inode, &wbc); + ret = 
filemap_write_and_wait(inode->i_mapping); + if (!ret) { + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (!ret) + pnfs_sync_inode(inode, true); + } trace_nfs_writeback_inode_exit(inode, ret); return ret; diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 410abd1..b95f914 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -511,6 +511,7 @@ extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned * Try to write back everything synchronously (but check the * return value!) */ +extern int nfs_sync_inode(struct inode *inode); extern int nfs_wb_all(struct inode *inode); extern int nfs_wb_page(struct inode *inode, struct page* page); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 4cb3eaa..93ab607 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -255,11 +255,13 @@ struct nfs4_layoutget { struct nfs4_getdeviceinfo_args { struct nfs4_sequence_args seq_args; struct pnfs_device *pdev; + __u32 notify_types; }; struct nfs4_getdeviceinfo_res { struct nfs4_sequence_res seq_res; struct pnfs_device *pdev; + __u32 notification; }; struct nfs4_layoutcommit_args { @@ -1271,11 +1273,15 @@ struct nfs42_falloc_args { nfs4_stateid falloc_stateid; u64 falloc_offset; u64 falloc_length; + const u32 *falloc_bitmask; }; struct nfs42_falloc_res { struct nfs4_sequence_res seq_res; unsigned int status; + + struct nfs_fattr *falloc_fattr; + const struct nfs_server *falloc_server; }; struct nfs42_seek_args { diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h index aadc6a0..8073713 100644 --- a/include/linux/sunrpc/msg_prot.h +++ b/include/linux/sunrpc/msg_prot.h @@ -142,12 +142,18 @@ typedef __be32 rpc_fraghdr; (RPC_REPHDRSIZE + (2 + RPC_MAX_AUTH_SIZE/4)) /* - * RFC1833/RFC3530 rpcbind (v3+) well-known netid's. + * Well-known netids. See: + * + * http://www.iana.org/assignments/rpc-netids/rpc-netids.xhtml */ #define RPCBIND_NETID_UDP "udp" #define RPCBIND_NETID_TCP "tcp" +#define RPCBIND_NETID_RDMA "rdma" +#define RPCBIND_NETID_SCTP "sctp" #define RPCBIND_NETID_UDP6 "udp6" #define RPCBIND_NETID_TCP6 "tcp6" +#define RPCBIND_NETID_RDMA6 "rdma6" +#define RPCBIND_NETID_SCTP6 "sctp6" #define RPCBIND_NETID_LOCAL "local" /* diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h index 64a0a0a..c984c85 100644 --- a/include/linux/sunrpc/xprtrdma.h +++ b/include/linux/sunrpc/xprtrdma.h @@ -41,11 +41,6 @@ #define _LINUX_SUNRPC_XPRTRDMA_H /* - * rpcbind (v3+) RDMA netid. - */ -#define RPCBIND_NETID_RDMA "rdma" - -/* * Constants. Max RPC/NFS header is big enough to account for * additional marshaling buffers passed down by Linux client. * diff --git a/include/uapi/linux/nfs_idmap.h b/include/uapi/linux/nfs_idmap.h index 8d4b1c7..038e36c 100644 --- a/include/uapi/linux/nfs_idmap.h +++ b/include/uapi/linux/nfs_idmap.h @@ -1,5 +1,5 @@ /* - * include/linux/nfs_idmap.h + * include/uapi/linux/nfs_idmap.h * * UID and GID to name mapping for clients. 
* diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index b91fd9c..337ca85 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -89,8 +89,8 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task) if (!task->tk_timeout) return; - dprintk("RPC: %5u setting alarm for %lu ms\n", - task->tk_pid, task->tk_timeout * 1000 / HZ); + dprintk("RPC: %5u setting alarm for %u ms\n", + task->tk_pid, jiffies_to_msecs(task->tk_timeout)); task->u.tk_wait.expires = jiffies + task->tk_timeout; if (list_empty(&queue->timer_list.list) || time_before(task->u.tk_wait.expires, queue->timer_list.expires)) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 9949722..1d4fe24 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -326,6 +326,15 @@ out_unlock: xprt_clear_locked(xprt); } +static void xprt_task_clear_bytes_sent(struct rpc_task *task) +{ + if (task != NULL) { + struct rpc_rqst *req = task->tk_rqstp; + if (req != NULL) + req->rq_bytes_sent = 0; + } +} + /** * xprt_release_xprt - allow other requests to use a transport * @xprt: transport with other tasks potentially waiting @@ -336,11 +345,7 @@ out_unlock: void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { - if (task != NULL) { - struct rpc_rqst *req = task->tk_rqstp; - if (req != NULL) - req->rq_bytes_sent = 0; - } + xprt_task_clear_bytes_sent(task); xprt_clear_locked(xprt); __xprt_lock_write_next(xprt); } @@ -358,11 +363,7 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt); void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { - if (task != NULL) { - struct rpc_rqst *req = task->tk_rqstp; - if (req != NULL) - req->rq_bytes_sent = 0; - } + xprt_task_clear_bytes_sent(task); xprt_clear_locked(xprt); __xprt_lock_write_next_cong(xprt); } @@ -700,6 +701,7 @@ bool xprt_lock_connect(struct rpc_xprt *xprt, goto out; if (xprt->snd_task != task) goto out; + xprt_task_clear_bytes_sent(task); xprt->snd_task = cookie; ret = true; out: diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index da5136f..579f72b 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -1,6 +1,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o -xprtrdma-y := transport.o rpc_rdma.o verbs.o +xprtrdma-y := transport.o rpc_rdma.o verbs.o \ + fmr_ops.o frwr_ops.o physical_ops.o obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c new file mode 100644 index 0000000..302d4eb --- /dev/null +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* Lightweight memory registration using Fast Memory Regions (FMR). + * Referred to sometimes as MTHCAFMR mode. + * + * FMR uses synchronous memory registration and deregistration. + * FMR registration is known to be fast, but FMR deregistration + * can take tens of usecs to complete. + */ + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +/* Maximum scatter/gather per FMR */ +#define RPCRDMA_MAX_FMR_SGES (64) + +static int +fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + return 0; +} + +/* FMR mode conveys up to 64 pages of payload per chunk segment. 
+ */ +static size_t +fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES); +} + +static int +fmr_op_init(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; + struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_FMR_SGES, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; + struct ib_pd *pd = r_xprt->rx_ia.ri_pd; + struct rpcrdma_mw *r; + int i, rc; + + INIT_LIST_HEAD(&buf->rb_mws); + INIT_LIST_HEAD(&buf->rb_all); + + i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; + dprintk("RPC: %s: initializing %d FMRs\n", __func__, i); + + while (i--) { + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (!r) + return -ENOMEM; + + r->r.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr); + if (IS_ERR(r->r.fmr)) + goto out_fmr_err; + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + } + return 0; + +out_fmr_err: + rc = PTR_ERR(r->r.fmr); + dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc); + kfree(r); + return rc; +} + +/* Use the ib_map_phys_fmr() verb to register a memory region + * for remote access via RDMA READ or RDMA WRITE. + */ +static int +fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct ib_device *device = ia->ri_id->device; + enum dma_data_direction direction = rpcrdma_data_dir(writing); + struct rpcrdma_mr_seg *seg1 = seg; + struct rpcrdma_mw *mw = seg1->rl_mw; + u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; + int len, pageoff, i, rc; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (nsegs > RPCRDMA_MAX_FMR_SGES) + nsegs = RPCRDMA_MAX_FMR_SGES; + for (i = 0; i < nsegs;) { + rpcrdma_map_one(device, seg, direction); + physaddrs[i] = seg->mr_dma; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + + rc = ib_map_phys_fmr(mw->r.fmr, physaddrs, i, seg1->mr_dma); + if (rc) + goto out_maperr; + + seg1->mr_rkey = mw->r.fmr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + return i; + +out_maperr: + dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", + __func__, len, (unsigned long long)seg1->mr_dma, + pageoff, i, rc); + while (i--) + rpcrdma_unmap_one(device, --seg); + return rc; +} + +/* Use the ib_unmap_fmr() verb to prevent further remote + * access via RDMA READ or RDMA WRITE. + */ +static int +fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_device *device; + int rc, nsegs = seg->mr_nsegs; + LIST_HEAD(l); + + list_add(&seg1->rl_mw->r.fmr->list, &l); + rc = ib_unmap_fmr(&l); + read_lock(&ia->ri_qplock); + device = ia->ri_id->device; + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(device, seg++); + read_unlock(&ia->ri_qplock); + if (rc) + goto out_err; + return nsegs; + +out_err: + dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc); + return nsegs; +} + +/* After a disconnect, unmap all FMRs. + * + * This is invoked only in the transport connect worker in order + * to serialize with rpcrdma_register_fmr_external(). 
+ */ +static void +fmr_op_reset(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_mw *r; + LIST_HEAD(list); + int rc; + + list_for_each_entry(r, &buf->rb_all, mw_all) + list_add(&r->r.fmr->list, &list); + + rc = ib_unmap_fmr(&list); + if (rc) + dprintk("RPC: %s: ib_unmap_fmr failed %i\n", + __func__, rc); +} + +static void +fmr_op_destroy(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int rc; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + rc = ib_dealloc_fmr(r->r.fmr); + if (rc) + dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", + __func__, rc); + kfree(r); + } +} + +const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { + .ro_map = fmr_op_map, + .ro_unmap = fmr_op_unmap, + .ro_open = fmr_op_open, + .ro_maxpages = fmr_op_maxpages, + .ro_init = fmr_op_init, + .ro_reset = fmr_op_reset, + .ro_destroy = fmr_op_destroy, + .ro_displayname = "fmr", +}; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c new file mode 100644 index 0000000..dff0481 --- /dev/null +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* Lightweight memory registration using Fast Registration Work + * Requests (FRWR). Also referred to sometimes as FRMR mode. + * + * FRWR features ordered asynchronous registration and deregistration + * of arbitrarily sized memory regions. This is the fastest and safest + * but most complex memory registration mode. + */ + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +static int +__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, + unsigned int depth) +{ + struct rpcrdma_frmr *f = &r->r.frmr; + int rc; + + f->fr_mr = ib_alloc_fast_reg_mr(pd, depth); + if (IS_ERR(f->fr_mr)) + goto out_mr_err; + f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth); + if (IS_ERR(f->fr_pgl)) + goto out_list_err; + return 0; + +out_mr_err: + rc = PTR_ERR(f->fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr status %i\n", + __func__, rc); + return rc; + +out_list_err: + rc = PTR_ERR(f->fr_pgl); + dprintk("RPC: %s: ib_alloc_fast_reg_page_list status %i\n", + __func__, rc); + ib_dereg_mr(f->fr_mr); + return rc; +} + +static void +__frwr_release(struct rpcrdma_mw *r) +{ + int rc; + + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s: ib_dereg_mr status %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); +} + +static int +frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + struct ib_device_attr *devattr = &ia->ri_devattr; + int depth, delta; + + ia->ri_max_frmr_depth = + min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + devattr->max_fast_reg_page_list_len); + dprintk("RPC: %s: device's max FR page list len = %u\n", + __func__, ia->ri_max_frmr_depth); + + /* Add room for frmr register and invalidate WRs. + * 1. FRMR reg WR for head + * 2. FRMR invalidate WR for head + * 3. N FRMR reg WRs for pagelist + * 4. N FRMR invalidate WRs for pagelist + * 5. FRMR reg WR for tail + * 6. FRMR invalidate WR for tail + * 7. The RDMA_SEND WR + */ + depth = 7; + + /* Calculate N if the device max FRMR depth is smaller than + * RPCRDMA_MAX_DATA_SEGS. 
+ */ + if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { + delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth; + do { + depth += 2; /* FRMR reg + invalidate */ + delta -= ia->ri_max_frmr_depth; + } while (delta > 0); + } + + ep->rep_attr.cap.max_send_wr *= depth; + if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) { + cdata->max_requests = devattr->max_qp_wr / depth; + if (!cdata->max_requests) + return -EINVAL; + ep->rep_attr.cap.max_send_wr = cdata->max_requests * + depth; + } + + return 0; +} + +/* FRWR mode conveys a list of pages per chunk segment. The + * maximum length of that list is the FRWR page list depth. + */ +static size_t +frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth); +} + +/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. */ +static void +frwr_sendcompletion(struct ib_wc *wc) +{ + struct rpcrdma_mw *r; + + if (likely(wc->status == IB_WC_SUCCESS)) + return; + + /* WARNING: Only wr_id and status are reliable at this point */ + r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + dprintk("RPC: %s: frmr %p (stale), status %d\n", + __func__, r, wc->status); + r->r.frmr.fr_state = FRMR_IS_STALE; +} + +static int +frwr_op_init(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct ib_device *device = r_xprt->rx_ia.ri_id->device; + unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; + struct ib_pd *pd = r_xprt->rx_ia.ri_pd; + int i; + + INIT_LIST_HEAD(&buf->rb_mws); + INIT_LIST_HEAD(&buf->rb_all); + + i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; + dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i); + + while (i--) { + struct rpcrdma_mw *r; + int rc; + + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (!r) + return -ENOMEM; + + rc = __frwr_init(r, pd, device, depth); + if (rc) { + kfree(r); + return rc; + } + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + r->mw_sendcompletion = frwr_sendcompletion; + } + + return 0; +} + +/* Post a FAST_REG Work Request to register a memory region + * for remote access via RDMA READ or RDMA WRITE. 
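Before posting the FAST_REG WR, frwr_op_map below rotates the low-order byte of the MR's rkey so a stale rkey from an earlier registration is not honored; on a send error the bump is undone. A sketch of just that bookkeeping, with ib_update_fast_reg_key() modeled as a plain masked assignment (the real verb also updates the driver's copy):

#include <stdio.h>
#include <stdint.h>

struct mr { unsigned int rkey; };	/* stand-in for struct ib_mr */

/* Model of ib_update_fast_reg_key(): the upper 24 bits identify the MR,
 * the low 8 bits are a generation key the consumer may rotate. */
static void update_fast_reg_key(struct mr *mr, uint8_t newkey)
{
	mr->rkey = (mr->rkey & 0xFFFFFF00u) | newkey;
}

int main(void)
{
	struct mr mr = { .rkey = 0x00042aff };	/* invented rkey */
	uint8_t key = (uint8_t)(mr.rkey & 0x000000FF);

	update_fast_reg_key(&mr, ++key);	/* as in frwr_op_map */
	printf("rkey now 0x%08x\n", mr.rkey);	/* low byte wrapped to 0x00 */

	update_fast_reg_key(&mr, --key);	/* error path: undo the bump */
	printf("rkey back to 0x%08x\n", mr.rkey);
	return 0;
}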
+ */ +static int +frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct ib_device *device = ia->ri_id->device; + enum dma_data_direction direction = rpcrdma_data_dir(writing); + struct rpcrdma_mr_seg *seg1 = seg; + struct rpcrdma_mw *mw = seg1->rl_mw; + struct rpcrdma_frmr *frmr = &mw->r.frmr; + struct ib_mr *mr = frmr->fr_mr; + struct ib_send_wr fastreg_wr, *bad_wr; + u8 key; + int len, pageoff; + int i, rc; + int seg_len; + u64 pa; + int page_no; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (nsegs > ia->ri_max_frmr_depth) + nsegs = ia->ri_max_frmr_depth; + for (page_no = i = 0; i < nsegs;) { + rpcrdma_map_one(device, seg, direction); + pa = seg->mr_dma; + for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { + frmr->fr_pgl->page_list[page_no++] = pa; + pa += PAGE_SIZE; + } + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n", + __func__, mw, i, len); + + frmr->fr_state = FRMR_IS_VALID; + + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.wr_id = (unsigned long)(void *)mw; + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff; + fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.page_list_len = page_no; + fastreg_wr.wr.fast_reg.length = len; + fastreg_wr.wr.fast_reg.access_flags = writing ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ; + key = (u8)(mr->rkey & 0x000000FF); + ib_update_fast_reg_key(mr, ++key); + fastreg_wr.wr.fast_reg.rkey = mr->rkey; + + DECR_CQCOUNT(&r_xprt->rx_ep); + rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); + if (rc) + goto out_senderr; + + seg1->mr_rkey = mr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + return i; + +out_senderr: + dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); + ib_update_fast_reg_key(mr, --key); + frmr->fr_state = FRMR_IS_INVALID; + while (i--) + rpcrdma_unmap_one(device, --seg); + return rc; +} + +/* Post a LOCAL_INV Work Request to prevent further remote access + * via RDMA READ or RDMA WRITE. 
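The page list posted by the FAST_REG WR above carries one entry per PAGE_SIZE spanned by each mapped segment, all stepping from the segment's DMA address. A standalone model of that inner loop, with invented addresses in place of seg->mr_dma and a plain array standing in for frmr->fr_pgl->page_list:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096

/* Model of the pagelist fill in frwr_op_map: each segment contributes
 * one page-list entry per PAGE_SIZE it spans, starting at its DMA base. */
static int fill_page_list(uint64_t *page_list, int max,
			  const uint64_t *dma, const size_t *len, int nsegs)
{
	int page_no = 0;

	for (int i = 0; i < nsegs; i++) {
		uint64_t pa = dma[i];

		for (long seg_len = (long)len[i]; seg_len > 0;
		     seg_len -= PAGE_SIZE) {
			if (page_no == max)
				return -1;	/* would exceed FRMR depth */
			page_list[page_no++] = pa;
			pa += PAGE_SIZE;
		}
	}
	return page_no;
}

int main(void)
{
	uint64_t pgl[8];
	uint64_t dma[] = { 0x100000, 0x200000 };
	size_t len[]   = { 8192, 100 };	/* two full pages, then a short tail */
	int n = fill_page_list(pgl, 8, dma, len, 2);

	printf("page_list_len = %d\n", n);	/* 3 entries */
	return 0;
}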
+ */ +static int +frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_mr_seg *seg1 = seg; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct ib_send_wr invalidate_wr, *bad_wr; + int rc, nsegs = seg->mr_nsegs; + struct ib_device *device; + + seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; + + memset(&invalidate_wr, 0, sizeof(invalidate_wr)); + invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw; + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + read_lock(&ia->ri_qplock); + device = ia->ri_id->device; + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(device, seg++); + rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + read_unlock(&ia->ri_qplock); + if (rc) + goto out_err; + return nsegs; + +out_err: + /* Force rpcrdma_buffer_get() to retry */ + seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE; + dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); + return nsegs; +} + +/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in + * an unusable state. Find FRMRs in this state and dereg / reg + * each. FRMRs that are VALID and attached to an rpcrdma_req are + * also torn down. + * + * This gives all in-use FRMRs a fresh rkey and leaves them INVALID. + * + * This is invoked only in the transport connect worker in order + * to serialize with rpcrdma_register_frmr_external(). + */ +static void +frwr_op_reset(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct ib_device *device = r_xprt->rx_ia.ri_id->device; + unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; + struct ib_pd *pd = r_xprt->rx_ia.ri_pd; + struct rpcrdma_mw *r; + int rc; + + list_for_each_entry(r, &buf->rb_all, mw_all) { + if (r->r.frmr.fr_state == FRMR_IS_INVALID) + continue; + + __frwr_release(r); + rc = __frwr_init(r, pd, device, depth); + if (rc) { + dprintk("RPC: %s: mw %p left %s\n", + __func__, r, + (r->r.frmr.fr_state == FRMR_IS_STALE ? + "stale" : "valid")); + continue; + } + + r->r.frmr.fr_state = FRMR_IS_INVALID; + } +} + +static void +frwr_op_destroy(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + __frwr_release(r); + kfree(r); + } +} + +const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { + .ro_map = frwr_op_map, + .ro_unmap = frwr_op_unmap, + .ro_open = frwr_op_open, + .ro_maxpages = frwr_op_maxpages, + .ro_init = frwr_op_init, + .ro_reset = frwr_op_reset, + .ro_destroy = frwr_op_destroy, + .ro_displayname = "frwr", +}; diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c new file mode 100644 index 0000000..ba518af --- /dev/null +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* No-op chunk preparation. All client memory is pre-registered. + * Sometimes referred to as ALLPHYSICAL mode. + * + * Physical registration is simple because all client memory is + * pre-registered and never deregistered. This mode is good for + * adapter bring up, but is considered not safe: the server is + * trusted not to abuse its access to client memory not involved + * in RDMA I/O. 
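Taken together, fmr_ops.c, frwr_ops.c, and this file replace the old per-strategy switch statements with a const vtable chosen once at IA open time. A compressed user-space model of that dispatch pattern, with only two ops and two modes; the names loosely mirror struct rpcrdma_memreg_ops but are not the kernel's:

#include <stdio.h>

struct memreg_ops {
	int (*ro_map)(int nsegs);
	void (*ro_reset)(void);
	const char *ro_displayname;
};

static int frwr_map(int nsegs)  { return nsegs; }	/* pretend all segments fit */
static void frwr_reset(void)    { puts("frwr: refresh stale MRs"); }
static int phys_map(int nsegs)  { (void)nsegs; return 1; }	/* one page per chunk */
static void phys_reset(void)    { /* nothing registered, nothing to do */ }

static const struct memreg_ops frwr_ops = { frwr_map, frwr_reset, "frwr" };
static const struct memreg_ops phys_ops = { phys_map, phys_reset, "physical" };

int main(void)
{
	/* stand-in for ia->ri_ops, chosen by capability probing */
	const struct memreg_ops *ri_ops = &frwr_ops;

	printf("memory registration strategy is '%s'\n",
	       ri_ops->ro_displayname);
	printf("mapped %d segment(s)\n", ri_ops->ro_map(4));
	ri_ops->ro_reset();

	ri_ops = &phys_ops;	/* e.g. HCA lacks MEM_MGT_EXTENSIONS */
	printf("mapped %d segment(s)\n", ri_ops->ro_map(4));
	return 0;
}

The payoff shows up later in the diff: call sites in verbs.c such as ep_create, buffer_create, connect, and buffer_destroy each shrink to a single indirect call.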
+ */ + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +static int +physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + return 0; +} + +/* PHYSICAL memory registration conveys one page per chunk segment. + */ +static size_t +physical_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + rpcrdma_max_segments(r_xprt)); +} + +static int +physical_op_init(struct rpcrdma_xprt *r_xprt) +{ + return 0; +} + +/* The client's physical memory is already exposed for + * remote access via RDMA READ or RDMA WRITE. + */ +static int +physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + rpcrdma_map_one(ia->ri_id->device, seg, + rpcrdma_data_dir(writing)); + seg->mr_rkey = ia->ri_bind_mem->rkey; + seg->mr_base = seg->mr_dma; + seg->mr_nsegs = 1; + return 1; +} + +/* Unmap a memory region, but leave it registered. + */ +static int +physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + read_lock(&ia->ri_qplock); + rpcrdma_unmap_one(ia->ri_id->device, seg); + read_unlock(&ia->ri_qplock); + + return 1; +} + +static void +physical_op_reset(struct rpcrdma_xprt *r_xprt) +{ +} + +static void +physical_op_destroy(struct rpcrdma_buffer *buf) +{ +} + +const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { + .ro_map = physical_op_map, + .ro_unmap = physical_op_unmap, + .ro_open = physical_op_open, + .ro_maxpages = physical_op_maxpages, + .ro_init = physical_op_init, + .ro_reset = physical_op_reset, + .ro_destroy = physical_op_destroy, + .ro_displayname = "physical", +}; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 91ffde8..2c53ea9 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -53,6 +53,14 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif +enum rpcrdma_chunktype { + rpcrdma_noch = 0, + rpcrdma_readch, + rpcrdma_areadch, + rpcrdma_writech, + rpcrdma_replych +}; + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static const char transfertypes[][12] = { "pure inline", /* no chunks */ @@ -179,6 +187,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, struct rpcrdma_write_array *warray = NULL; struct rpcrdma_write_chunk *cur_wchunk = NULL; __be32 *iptr = headerp->rm_body.rm_chunks; + int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool); if (type == rpcrdma_readch || type == rpcrdma_areadch) { /* a read chunk - server will RDMA Read our memory */ @@ -201,9 +210,9 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, if (nsegs < 0) return nsegs; + map = r_xprt->rx_ia.ri_ops->ro_map; do { - n = rpcrdma_register_external(seg, nsegs, - cur_wchunk != NULL, r_xprt); + n = map(r_xprt, seg, nsegs, cur_wchunk != NULL); if (n <= 0) goto out; if (cur_rchunk) { /* read */ @@ -275,34 +284,13 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, return (unsigned char *)iptr - (unsigned char *)headerp; out: - if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) { - for (pos = 0; nchunks--;) - pos += rpcrdma_deregister_external( - &req->rl_segments[pos], r_xprt); - } - return n; -} + if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR) + return n; -/* - * Marshal chunks. This routine returns the header length - * consumed by marshaling. 
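One subtlety in the rpcrdma_create_chunks() error path in this hunk: the unwind loop advances by the return value of ro_unmap, because each registered chunk may have consumed several rl_segments entries. A small model of that walk, with an invented segment layout:

#include <stdio.h>

/* Model of the error unwind in rpcrdma_create_chunks(): registration
 * failed midway, so every chunk registered so far is torn down. Each
 * ro_unmap() returns the number of segments its chunk covered, which
 * is how far to advance through req->rl_segments. */
int main(void)
{
	/* the first segment of each registered chunk records its length */
	int seg_nsegs[8] = { [0] = 3, [3] = 1, [4] = 4 };
	int nchunks = 3;	/* chunks successfully registered */
	int pos = 0;

	while (nchunks--)
		pos += seg_nsegs[pos];	/* stand-in for ri_ops->ro_unmap() */

	printf("unwound %d segments across 3 chunks\n", pos);	/* 8 */
	return 0;
}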
- * - * Returns positive RPC/RDMA header size, or negative errno. - */ - -ssize_t -rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result) -{ - struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - struct rpcrdma_msg *headerp = rdmab_to_msg(req->rl_rdmabuf); - - if (req->rl_rtype != rpcrdma_noch) - result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, - headerp, req->rl_rtype); - else if (req->rl_wtype != rpcrdma_noch) - result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, - headerp, req->rl_wtype); - return result; + for (pos = 0; nchunks--;) + pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, + &req->rl_segments[pos]); + return n; } /* @@ -397,6 +385,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) char *base; size_t rpclen, padlen; ssize_t hdrlen; + enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; /* @@ -433,13 +422,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * into pages; otherwise use reply chunks. */ if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) - req->rl_wtype = rpcrdma_noch; + wtype = rpcrdma_noch; else if (rqst->rq_rcv_buf.page_len == 0) - req->rl_wtype = rpcrdma_replych; + wtype = rpcrdma_replych; else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) - req->rl_wtype = rpcrdma_writech; + wtype = rpcrdma_writech; else - req->rl_wtype = rpcrdma_replych; + wtype = rpcrdma_replych; /* * Chunks needed for arguments? @@ -456,16 +445,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * TBD check NFSv4 setacl */ if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) - req->rl_rtype = rpcrdma_noch; + rtype = rpcrdma_noch; else if (rqst->rq_snd_buf.page_len == 0) - req->rl_rtype = rpcrdma_areadch; + rtype = rpcrdma_areadch; else - req->rl_rtype = rpcrdma_readch; + rtype = rpcrdma_readch; /* The following simplification is not true forever */ - if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych) - req->rl_wtype = rpcrdma_noch; - if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) { + if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) + wtype = rpcrdma_noch; + if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { dprintk("RPC: %s: cannot marshal multiple chunk lists\n", __func__); return -EIO; @@ -479,7 +468,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * When padding is in use and applies to the transfer, insert * it and change the message type. */ - if (req->rl_rtype == rpcrdma_noch) { + if (rtype == rpcrdma_noch) { padlen = rpcrdma_inline_pullup(rqst, RPCRDMA_INLINE_PAD_VALUE(rqst)); @@ -494,7 +483,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ - if (req->rl_wtype != rpcrdma_noch) { + if (wtype != rpcrdma_noch) { dprintk("RPC: %s: invalid chunk list\n", __func__); return -EIO; @@ -515,18 +504,26 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * on receive. Therefore, we request a reply chunk * for non-writes wherever feasible and efficient. 
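The rtype/wtype selection above is a pure decision tree over the request's buffer shape. A standalone rendering of it, with invented inline thresholds standing in for RPCRDMA_INLINE_READ_THRESHOLD() and RPCRDMA_INLINE_WRITE_THRESHOLD():

#include <stdio.h>
#include <stdbool.h>

enum chunktype { noch, readch, areadch, writech, replych };

/* Model of the rtype/wtype choice in rpcrdma_marshal_req(); threshold
 * values are parameters here, derived from mount options in the kernel. */
static enum chunktype choose_wtype(size_t buflen, size_t page_len,
				   bool xdrbuf_read, size_t inline_rsize)
{
	if (buflen <= inline_rsize)
		return noch;		/* whole reply fits inline */
	if (page_len == 0)
		return replych;
	if (xdrbuf_read)
		return writech;		/* data lands directly in pages */
	return replych;
}

static enum chunktype choose_rtype(size_t sndlen, size_t page_len,
				   size_t inline_wsize)
{
	if (sndlen <= inline_wsize)
		return noch;		/* whole call fits inline */
	if (page_len == 0)
		return areadch;		/* position-zero read chunk */
	return readch;
}

int main(void)
{
	enum chunktype rtype = choose_rtype(65536, 4096, 1024);
	enum chunktype wtype = choose_wtype(65536, 0, false, 1024);

	/* the simplification the code comments on: a read chunk and a
	 * reply chunk cannot yet be marshaled in the same call */
	if (rtype != noch && wtype == replych)
		wtype = noch;

	printf("rtype=%d wtype=%d\n", rtype, wtype);
	return 0;
}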
*/ - if (req->rl_wtype == rpcrdma_noch) - req->rl_wtype = rpcrdma_replych; + if (wtype == rpcrdma_noch) + wtype = rpcrdma_replych; } } - hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen); + if (rtype != rpcrdma_noch) { + hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, + headerp, rtype); + wtype = rtype; /* simplify dprintk */ + + } else if (wtype != rpcrdma_noch) { + hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, + headerp, wtype); + } if (hdrlen < 0) return hdrlen; dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" " headerp 0x%p base 0x%p lkey 0x%x\n", - __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen, + __func__, transfertypes[wtype], hdrlen, rpclen, padlen, headerp, base, rdmab_lkey(req->rl_rdmabuf)); /* diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 2e192ba..54f23b1 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -157,12 +157,47 @@ static struct ctl_table sunrpc_table[] = { static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ static void +xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)sap; + char buf[20]; + + snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr)); + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); + + xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA; +} + +static void +xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap) +{ + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + char buf[40]; + + snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr); + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); + + xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6; +} + +static void xprt_rdma_format_addresses(struct rpc_xprt *xprt) { struct sockaddr *sap = (struct sockaddr *) &rpcx_to_rdmad(xprt).addr; - struct sockaddr_in *sin = (struct sockaddr_in *)sap; - char buf[64]; + char buf[128]; + + switch (sap->sa_family) { + case AF_INET: + xprt_rdma_format_addresses4(xprt, sap); + break; + case AF_INET6: + xprt_rdma_format_addresses6(xprt, sap); + break; + default: + pr_err("rpcrdma: Unrecognized address family\n"); + return; + } (void)rpc_ntop(sap, buf, sizeof(buf)); xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); @@ -170,16 +205,10 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt) snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap)); xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); - xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; - - snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr)); - xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); - snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap)); xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); - /* netid */ - xprt->address_strings[RPC_DISPLAY_NETID] = "rdma"; + xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; } static void @@ -377,7 +406,10 @@ xprt_setup_rdma(struct xprt_create *args) xprt_rdma_connect_worker); xprt_rdma_format_addresses(xprt); - xprt->max_payload = rpcrdma_max_payload(new_xprt); + xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt); + if (xprt->max_payload == 0) + goto out4; + xprt->max_payload <<= PAGE_SHIFT; dprintk("RPC: %s: transport data payload maximum: %zu bytes\n", __func__, xprt->max_payload); @@ -552,8 +584,8 @@ xprt_rdma_free(void *buffer) for (i = 0; req->rl_nchunks;) { 
--req->rl_nchunks; - i += rpcrdma_deregister_external( - &req->rl_segments[i], r_xprt); + i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, + &req->rl_segments[i]); } rpcrdma_buffer_put(req); @@ -579,10 +611,7 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); int rc = 0; - if (req->rl_niovs == 0) - rc = rpcrdma_marshal_req(rqst); - else if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_ALLPHYSICAL) - rc = rpcrdma_marshal_chunks(rqst, 0); + rc = rpcrdma_marshal_req(rqst); if (rc < 0) goto failed_marshal; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index e28909f..4870d27 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -50,6 +50,7 @@ #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/prefetch.h> +#include <linux/sunrpc/addr.h> #include <asm/bitops.h> #include "xprt_rdma.h" @@ -62,9 +63,6 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -static void rpcrdma_reset_frmrs(struct rpcrdma_ia *); -static void rpcrdma_reset_fmrs(struct rpcrdma_ia *); - /* * internal functions */ @@ -188,7 +186,7 @@ static const char * const wc_status[] = { "remote access error", "remote operation error", "transport retry counter exceeded", - "RNR retrycounter exceeded", + "RNR retry counter exceeded", "local RDD violation error", "remove invalid RD request", "operation aborted", @@ -206,21 +204,17 @@ static const char * const wc_status[] = { static void rpcrdma_sendcq_process_wc(struct ib_wc *wc) { - if (likely(wc->status == IB_WC_SUCCESS)) - return; - /* WARNING: Only wr_id and status are reliable at this point */ - if (wc->wr_id == 0ULL) { - if (wc->status != IB_WC_WR_FLUSH_ERR) + if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) { + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) pr_err("RPC: %s: SEND: %s\n", __func__, COMPLETION_MSG(wc->status)); } else { struct rpcrdma_mw *r; r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; - r->r.frmr.fr_state = FRMR_IS_STALE; - pr_err("RPC: %s: frmr %p (stale): %s\n", - __func__, r, COMPLETION_MSG(wc->status)); + r->mw_sendcompletion(wc); } } @@ -424,7 +418,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) struct rpcrdma_ia *ia = &xprt->rx_ia; struct rpcrdma_ep *ep = &xprt->rx_ep; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr; + struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr; #endif struct ib_qp_attr *attr = &ia->ri_qp_attr; struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr; @@ -480,9 +474,8 @@ connected: wake_up_all(&ep->rep_connect_wait); /*FALLTHROUGH*/ default: - dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n", - __func__, &addr->sin_addr.s_addr, - ntohs(addr->sin_port), ep, + dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n", + __func__, sap, rpc_get_port(sap), ep, CONNECTION_MSG(event->event)); break; } @@ -491,19 +484,16 @@ connected: if (connstate == 1) { int ird = attr->max_dest_rd_atomic; int tird = ep->rep_remote_cma.responder_resources; - printk(KERN_INFO "rpcrdma: connection to %pI4:%u " - "on %s, memreg %d slots %d ird %d%s\n", - &addr->sin_addr.s_addr, - ntohs(addr->sin_port), + + pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n", + sap, rpc_get_port(sap), ia->ri_id->device->name, - ia->ri_memreg_strategy, + ia->ri_ops->ro_displayname, xprt->rx_buf.rb_max_requests, ird, ird < 4 && ird < tird / 2 ? 
" (low!)" : ""); } else if (connstate < 0) { - printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n", - &addr->sin_addr.s_addr, - ntohs(addr->sin_port), - connstate); + pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n", + sap, rpc_get_port(sap), connstate); } #endif @@ -621,17 +611,13 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) if (memreg == RPCRDMA_FRMR) { /* Requires both frmr reg and local dma lkey */ - if ((devattr->device_cap_flags & + if (((devattr->device_cap_flags & (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != - (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) || + (devattr->max_fast_reg_page_list_len == 0)) { dprintk("RPC: %s: FRMR registration " "not supported by HCA\n", __func__); memreg = RPCRDMA_MTHCAFMR; - } else { - /* Mind the ia limit on FRMR page list depth */ - ia->ri_max_frmr_depth = min_t(unsigned int, - RPCRDMA_MAX_DATA_SEGS, - devattr->max_fast_reg_page_list_len); } } if (memreg == RPCRDMA_MTHCAFMR) { @@ -652,13 +638,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) */ switch (memreg) { case RPCRDMA_FRMR: + ia->ri_ops = &rpcrdma_frwr_memreg_ops; break; case RPCRDMA_ALLPHYSICAL: + ia->ri_ops = &rpcrdma_physical_memreg_ops; mem_priv = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; goto register_setup; case RPCRDMA_MTHCAFMR: + ia->ri_ops = &rpcrdma_fmr_memreg_ops; if (ia->ri_have_dma_lkey) break; mem_priv = IB_ACCESS_LOCAL_WRITE; @@ -678,8 +667,8 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) rc = -ENOMEM; goto out3; } - dprintk("RPC: %s: memory registration strategy is %d\n", - __func__, memreg); + dprintk("RPC: %s: memory registration strategy is '%s'\n", + __func__, ia->ri_ops->ro_displayname); /* Else will do memory reg/dereg for each chunk */ ia->ri_memreg_strategy = memreg; @@ -743,49 +732,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; ep->rep_attr.qp_context = ep; - /* send_cq and recv_cq initialized below */ ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_wr = cdata->max_requests; - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: { - int depth = 7; - - /* Add room for frmr register and invalidate WRs. - * 1. FRMR reg WR for head - * 2. FRMR invalidate WR for head - * 3. N FRMR reg WRs for pagelist - * 4. N FRMR invalidate WRs for pagelist - * 5. FRMR reg WR for tail - * 6. FRMR invalidate WR for tail - * 7. The RDMA_SEND WR - */ - - /* Calculate N if the device max FRMR depth is smaller than - * RPCRDMA_MAX_DATA_SEGS. - */ - if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { - int delta = RPCRDMA_MAX_DATA_SEGS - - ia->ri_max_frmr_depth; - - do { - depth += 2; /* FRMR reg + invalidate */ - delta -= ia->ri_max_frmr_depth; - } while (delta > 0); - - } - ep->rep_attr.cap.max_send_wr *= depth; - if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) { - cdata->max_requests = devattr->max_qp_wr / depth; - if (!cdata->max_requests) - return -EINVAL; - ep->rep_attr.cap.max_send_wr = cdata->max_requests * - depth; - } - break; - } - default: - break; - } + rc = ia->ri_ops->ro_open(ia, ep, cdata); + if (rc) + return rc; ep->rep_attr.cap.max_recv_wr = cdata->max_requests; ep->rep_attr.cap.max_send_sge = (cdata->padding ? 
4 : 2); ep->rep_attr.cap.max_recv_sge = 1; @@ -944,21 +895,9 @@ retry: rpcrdma_ep_disconnect(ep, ia); rpcrdma_flush_cqs(ep); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rpcrdma_reset_frmrs(ia); - break; - case RPCRDMA_MTHCAFMR: - rpcrdma_reset_fmrs(ia); - break; - case RPCRDMA_ALLPHYSICAL: - break; - default: - rc = -EIO; - goto out; - } - xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); + ia->ri_ops->ro_reset(xprt); + id = rpcrdma_create_id(xprt, ia, (struct sockaddr *)&xprt->rx_data.addr); if (IS_ERR(id)) { @@ -1123,91 +1062,6 @@ out: return ERR_PTR(rc); } -static int -rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) -{ - int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; - struct ib_fmr_attr fmr_attr = { - .max_pages = RPCRDMA_MAX_DATA_SEGS, - .max_maps = 1, - .page_shift = PAGE_SHIFT - }; - struct rpcrdma_mw *r; - int i, rc; - - i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; - dprintk("RPC: %s: initializing %d FMRs\n", __func__, i); - - while (i--) { - r = kzalloc(sizeof(*r), GFP_KERNEL); - if (r == NULL) - return -ENOMEM; - - r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr); - if (IS_ERR(r->r.fmr)) { - rc = PTR_ERR(r->r.fmr); - dprintk("RPC: %s: ib_alloc_fmr failed %i\n", - __func__, rc); - goto out_free; - } - - list_add(&r->mw_list, &buf->rb_mws); - list_add(&r->mw_all, &buf->rb_all); - } - return 0; - -out_free: - kfree(r); - return rc; -} - -static int -rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) -{ - struct rpcrdma_frmr *f; - struct rpcrdma_mw *r; - int i, rc; - - i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; - dprintk("RPC: %s: initializing %d FRMRs\n", __func__, i); - - while (i--) { - r = kzalloc(sizeof(*r), GFP_KERNEL); - if (r == NULL) - return -ENOMEM; - f = &r->r.frmr; - - f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, - ia->ri_max_frmr_depth); - if (IS_ERR(f->fr_mr)) { - rc = PTR_ERR(f->fr_mr); - dprintk("RPC: %s: ib_alloc_fast_reg_mr " - "failed %i\n", __func__, rc); - goto out_free; - } - - f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device, - ia->ri_max_frmr_depth); - if (IS_ERR(f->fr_pgl)) { - rc = PTR_ERR(f->fr_pgl); - dprintk("RPC: %s: ib_alloc_fast_reg_page_list " - "failed %i\n", __func__, rc); - - ib_dereg_mr(f->fr_mr); - goto out_free; - } - - list_add(&r->mw_list, &buf->rb_mws); - list_add(&r->mw_all, &buf->rb_all); - } - - return 0; - -out_free: - kfree(r); - return rc; -} - int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) { @@ -1244,22 +1098,9 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) buf->rb_recv_bufs = (struct rpcrdma_rep **) p; p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests]; - INIT_LIST_HEAD(&buf->rb_mws); - INIT_LIST_HEAD(&buf->rb_all); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rc = rpcrdma_init_frmrs(ia, buf); - if (rc) - goto out; - break; - case RPCRDMA_MTHCAFMR: - rc = rpcrdma_init_fmrs(ia, buf); - if (rc) - goto out; - break; - default: - break; - } + rc = ia->ri_ops->ro_init(r_xprt); + if (rc) + goto out; for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_req *req; @@ -1311,47 +1152,6 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) kfree(req); } -static void -rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf) -{ - struct rpcrdma_mw *r; - int rc; - - while (!list_empty(&buf->rb_all)) { - r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&r->mw_all); - list_del(&r->mw_list); - - rc = ib_dealloc_fmr(r->r.fmr); - if (rc) - dprintk("RPC: %s: 
ib_dealloc_fmr failed %i\n", - __func__, rc); - - kfree(r); - } -} - -static void -rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf) -{ - struct rpcrdma_mw *r; - int rc; - - while (!list_empty(&buf->rb_all)) { - r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&r->mw_all); - list_del(&r->mw_list); - - rc = ib_dereg_mr(r->r.frmr.fr_mr); - if (rc) - dprintk("RPC: %s: ib_dereg_mr failed %i\n", - __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); - - kfree(r); - } -} - void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { @@ -1372,104 +1172,11 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]); } - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rpcrdma_destroy_frmrs(buf); - break; - case RPCRDMA_MTHCAFMR: - rpcrdma_destroy_fmrs(buf); - break; - default: - break; - } + ia->ri_ops->ro_destroy(buf); kfree(buf->rb_pool); } -/* After a disconnect, unmap all FMRs. - * - * This is invoked only in the transport connect worker in order - * to serialize with rpcrdma_register_fmr_external(). - */ -static void -rpcrdma_reset_fmrs(struct rpcrdma_ia *ia) -{ - struct rpcrdma_xprt *r_xprt = - container_of(ia, struct rpcrdma_xprt, rx_ia); - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct list_head *pos; - struct rpcrdma_mw *r; - LIST_HEAD(l); - int rc; - - list_for_each(pos, &buf->rb_all) { - r = list_entry(pos, struct rpcrdma_mw, mw_all); - - INIT_LIST_HEAD(&l); - list_add(&r->r.fmr->list, &l); - rc = ib_unmap_fmr(&l); - if (rc) - dprintk("RPC: %s: ib_unmap_fmr failed %i\n", - __func__, rc); - } -} - -/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in - * an unusable state. Find FRMRs in this state and dereg / reg - * each. FRMRs that are VALID and attached to an rpcrdma_req are - * also torn down. - * - * This gives all in-use FRMRs a fresh rkey and leaves them INVALID. - * - * This is invoked only in the transport connect worker in order - * to serialize with rpcrdma_register_frmr_external(). - */ -static void -rpcrdma_reset_frmrs(struct rpcrdma_ia *ia) -{ - struct rpcrdma_xprt *r_xprt = - container_of(ia, struct rpcrdma_xprt, rx_ia); - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct list_head *pos; - struct rpcrdma_mw *r; - int rc; - - list_for_each(pos, &buf->rb_all) { - r = list_entry(pos, struct rpcrdma_mw, mw_all); - - if (r->r.frmr.fr_state == FRMR_IS_INVALID) - continue; - - rc = ib_dereg_mr(r->r.frmr.fr_mr); - if (rc) - dprintk("RPC: %s: ib_dereg_mr failed %i\n", - __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); - - r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, - ia->ri_max_frmr_depth); - if (IS_ERR(r->r.frmr.fr_mr)) { - rc = PTR_ERR(r->r.frmr.fr_mr); - dprintk("RPC: %s: ib_alloc_fast_reg_mr" - " failed %i\n", __func__, rc); - continue; - } - r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list( - ia->ri_id->device, - ia->ri_max_frmr_depth); - if (IS_ERR(r->r.frmr.fr_pgl)) { - rc = PTR_ERR(r->r.frmr.fr_pgl); - dprintk("RPC: %s: " - "ib_alloc_fast_reg_page_list " - "failed %i\n", __func__, rc); - - ib_dereg_mr(r->r.frmr.fr_mr); - continue; - } - r->r.frmr.fr_state = FRMR_IS_INVALID; - } -} - /* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving * some req segments uninitialized. */ @@ -1509,7 +1216,7 @@ rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) } } -/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external(). +/* rpcrdma_unmap_one() was already done during deregistration. 
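The teardown this hunk consolidates behind ro_destroy (and its fmr_op_destroy/frwr_op_destroy implementations earlier in the diff) has one shape: pop rb_all until empty, unlink, release the verbs object, free. A self-contained model with a toy intrusive list in place of struct list_head:

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_INIT(name) { &(name), &(name) }
#define container_of(p, type, member) \
	((type *)((char *)(p) - offsetof(type, member)))

static void list_add(struct list_head *e, struct list_head *head)
{
	e->next = head->next;
	e->prev = head;
	head->next->prev = e;
	head->next = e;
}

static void list_del(struct list_head *e)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
}

struct mw {
	int id;
	struct list_head mw_all;
};

int main(void)
{
	struct list_head rb_all = LIST_INIT(rb_all);
	struct mw mws[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };

	for (int i = 0; i < 3; i++)
		list_add(&mws[i].mw_all, &rb_all);

	while (rb_all.next != &rb_all) {	/* !list_empty() */
		struct mw *r = container_of(rb_all.next, struct mw, mw_all);

		list_del(&r->mw_all);
		printf("dealloc + free mw %d\n", r->id);	/* ib_dealloc_fmr()/kfree() in the real code */
	}
	return 0;
}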
* Redo only the ib_post_send(). */ static void @@ -1729,6 +1436,14 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) * Wrappers for internal-use kmalloc memory registration, used by buffer code. */ +void +rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg) +{ + dprintk("RPC: map_one: offset %p iova %llx len %zu\n", + seg->mr_offset, + (unsigned long long)seg->mr_dma, seg->mr_dmalen); +} + static int rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, struct ib_mr **mrp, struct ib_sge *iov) @@ -1854,287 +1569,6 @@ rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) } /* - * Wrappers for chunk registration, shared by read/write chunk code. - */ - -static void -rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing) -{ - seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; - seg->mr_dmalen = seg->mr_len; - if (seg->mr_page) - seg->mr_dma = ib_dma_map_page(ia->ri_id->device, - seg->mr_page, offset_in_page(seg->mr_offset), - seg->mr_dmalen, seg->mr_dir); - else - seg->mr_dma = ib_dma_map_single(ia->ri_id->device, - seg->mr_offset, - seg->mr_dmalen, seg->mr_dir); - if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) { - dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n", - __func__, - (unsigned long long)seg->mr_dma, - seg->mr_offset, seg->mr_dmalen); - } -} - -static void -rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg) -{ - if (seg->mr_page) - ib_dma_unmap_page(ia->ri_id->device, - seg->mr_dma, seg->mr_dmalen, seg->mr_dir); - else - ib_dma_unmap_single(ia->ri_id->device, - seg->mr_dma, seg->mr_dmalen, seg->mr_dir); -} - -static int -rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, - int *nsegs, int writing, struct rpcrdma_ia *ia, - struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_mr_seg *seg1 = seg; - struct rpcrdma_mw *mw = seg1->rl_mw; - struct rpcrdma_frmr *frmr = &mw->r.frmr; - struct ib_mr *mr = frmr->fr_mr; - struct ib_send_wr fastreg_wr, *bad_wr; - u8 key; - int len, pageoff; - int i, rc; - int seg_len; - u64 pa; - int page_no; - - pageoff = offset_in_page(seg1->mr_offset); - seg1->mr_offset -= pageoff; /* start of page */ - seg1->mr_len += pageoff; - len = -pageoff; - if (*nsegs > ia->ri_max_frmr_depth) - *nsegs = ia->ri_max_frmr_depth; - for (page_no = i = 0; i < *nsegs;) { - rpcrdma_map_one(ia, seg, writing); - pa = seg->mr_dma; - for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { - frmr->fr_pgl->page_list[page_no++] = pa; - pa += PAGE_SIZE; - } - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < *nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) - break; - } - dprintk("RPC: %s: Using frmr %p to map %d segments\n", - __func__, mw, i); - - frmr->fr_state = FRMR_IS_VALID; - - memset(&fastreg_wr, 0, sizeof(fastreg_wr)); - fastreg_wr.wr_id = (unsigned long)(void *)mw; - fastreg_wr.opcode = IB_WR_FAST_REG_MR; - fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma; - fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; - fastreg_wr.wr.fast_reg.page_list_len = page_no; - fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; - if (fastreg_wr.wr.fast_reg.length < len) { - rc = -EIO; - goto out_err; - } - - /* Bump the key */ - key = (u8)(mr->rkey & 0x000000FF); - ib_update_fast_reg_key(mr, ++key); - - fastreg_wr.wr.fast_reg.access_flags = (writing ? 
- IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : - IB_ACCESS_REMOTE_READ); - fastreg_wr.wr.fast_reg.rkey = mr->rkey; - DECR_CQCOUNT(&r_xprt->rx_ep); - - rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); - if (rc) { - dprintk("RPC: %s: failed ib_post_send for register," - " status %i\n", __func__, rc); - ib_update_fast_reg_key(mr, --key); - goto out_err; - } else { - seg1->mr_rkey = mr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = i; - seg1->mr_len = len; - } - *nsegs = i; - return 0; -out_err: - frmr->fr_state = FRMR_IS_INVALID; - while (i--) - rpcrdma_unmap_one(ia, --seg); - return rc; -} - -static int -rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_mr_seg *seg1 = seg; - struct ib_send_wr invalidate_wr, *bad_wr; - int rc; - - seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; - - memset(&invalidate_wr, 0, sizeof invalidate_wr); - invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw; - invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey; - DECR_CQCOUNT(&r_xprt->rx_ep); - - read_lock(&ia->ri_qplock); - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); - read_unlock(&ia->ri_qplock); - if (rc) { - /* Force rpcrdma_buffer_get() to retry */ - seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE; - dprintk("RPC: %s: failed ib_post_send for invalidate," - " status %i\n", __func__, rc); - } - return rc; -} - -static int -rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg, - int *nsegs, int writing, struct rpcrdma_ia *ia) -{ - struct rpcrdma_mr_seg *seg1 = seg; - u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; - int len, pageoff, i, rc; - - pageoff = offset_in_page(seg1->mr_offset); - seg1->mr_offset -= pageoff; /* start of page */ - seg1->mr_len += pageoff; - len = -pageoff; - if (*nsegs > RPCRDMA_MAX_DATA_SEGS) - *nsegs = RPCRDMA_MAX_DATA_SEGS; - for (i = 0; i < *nsegs;) { - rpcrdma_map_one(ia, seg, writing); - physaddrs[i] = seg->mr_dma; - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < *nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) - break; - } - rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma); - if (rc) { - dprintk("RPC: %s: failed ib_map_phys_fmr " - "%u@0x%llx+%i (%d)... 
status %i\n", __func__, - len, (unsigned long long)seg1->mr_dma, - pageoff, i, rc); - while (i--) - rpcrdma_unmap_one(ia, --seg); - } else { - seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = i; - seg1->mr_len = len; - } - *nsegs = i; - return rc; -} - -static int -rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_ia *ia) -{ - struct rpcrdma_mr_seg *seg1 = seg; - LIST_HEAD(l); - int rc; - - list_add(&seg1->rl_mw->r.fmr->list, &l); - rc = ib_unmap_fmr(&l); - read_lock(&ia->ri_qplock); - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - read_unlock(&ia->ri_qplock); - if (rc) - dprintk("RPC: %s: failed ib_unmap_fmr," - " status %i\n", __func__, rc); - return rc; -} - -int -rpcrdma_register_external(struct rpcrdma_mr_seg *seg, - int nsegs, int writing, struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int rc = 0; - - switch (ia->ri_memreg_strategy) { - - case RPCRDMA_ALLPHYSICAL: - rpcrdma_map_one(ia, seg, writing); - seg->mr_rkey = ia->ri_bind_mem->rkey; - seg->mr_base = seg->mr_dma; - seg->mr_nsegs = 1; - nsegs = 1; - break; - - /* Registration using frmr registration */ - case RPCRDMA_FRMR: - rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt); - break; - - /* Registration using fmr memory registration */ - case RPCRDMA_MTHCAFMR: - rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); - break; - - default: - return -EIO; - } - if (rc) - return rc; - - return nsegs; -} - -int -rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int nsegs = seg->mr_nsegs, rc; - - switch (ia->ri_memreg_strategy) { - - case RPCRDMA_ALLPHYSICAL: - read_lock(&ia->ri_qplock); - rpcrdma_unmap_one(ia, seg); - read_unlock(&ia->ri_qplock); - break; - - case RPCRDMA_FRMR: - rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt); - break; - - case RPCRDMA_MTHCAFMR: - rc = rpcrdma_deregister_fmr_external(seg, ia); - break; - - default: - break; - } - return nsegs; -} - -/* * Prepost any receive buffer, then post send. * * Receive buffer is donated to hardware, reclaimed upon recv completion. @@ -2156,7 +1590,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, } send_wr.next = NULL; - send_wr.wr_id = 0ULL; /* no send cookie */ + send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION; send_wr.sg_list = req->rl_send_iov; send_wr.num_sge = req->rl_niovs; send_wr.opcode = IB_WR_SEND; @@ -2215,43 +1649,24 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, return rc; } -/* Physical mapping means one Read/Write list entry per-page. - * All list entries must fit within an inline buffer - * - * NB: The server must return a Write list for NFS READ, - * which has the same constraint. Factor in the inline - * rsize as well. +/* How many chunk list items fit within our inline buffers? 
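rpcrdma_max_segments(), defined just below, answers that question by dividing the smaller inline threshold (minus the minimum RPC/RDMA header) by the XDR size of one segment and rounding down to a power of two. A user-space model; the header and segment sizes are written out here as assumptions rather than pulled from the kernel headers:

#include <stdio.h>

#define HDRLEN_MIN 28		/* assumed minimum RPC/RDMA header */
#define SEGMENT_XDRSIZE 16	/* handle + length + 64-bit offset */

static int fls32(unsigned int v)	/* find last set bit, 1-based */
{
	int r = 0;

	while (v) {
		v >>= 1;
		r++;
	}
	return r;
}

static unsigned int max_segments(unsigned int inline_wsize,
				 unsigned int inline_rsize)
{
	unsigned int bytes =
		inline_wsize < inline_rsize ? inline_wsize : inline_rsize;

	bytes -= HDRLEN_MIN;
	if (bytes < SEGMENT_XDRSIZE * 2)
		return 0;	/* inline threshold too small */

	return 1u << (fls32(bytes / SEGMENT_XDRSIZE) - 1);
}

int main(void)
{
	/* 1024-byte inline buffers: (1024 - 28) / 16 = 62, rounded to 32 */
	printf("max chunk list size = %u segments\n",
	       max_segments(1024, 1024));
	return 0;
}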
*/ -static size_t -rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt) +unsigned int +rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; - unsigned int inline_size, pages; + int bytes, segments; - inline_size = min_t(unsigned int, - cdata->inline_wsize, cdata->inline_rsize); - inline_size -= RPCRDMA_HDRLEN_MIN; - pages = inline_size / sizeof(struct rpcrdma_segment); - return pages << PAGE_SHIFT; -} - -static size_t -rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt) -{ - return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT; -} - -size_t -rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt) -{ - size_t result; - - switch (r_xprt->rx_ia.ri_memreg_strategy) { - case RPCRDMA_ALLPHYSICAL: - result = rpcrdma_physical_max_payload(r_xprt); - break; - default: - result = rpcrdma_mr_max_payload(r_xprt); + bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize); + bytes -= RPCRDMA_HDRLEN_MIN; + if (bytes < sizeof(struct rpcrdma_segment) * 2) { + pr_warn("RPC: %s: inline threshold too small\n", + __func__); + return 0; } - return result; + + segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1); + dprintk("RPC: %s: max chunk list size = %d segments\n", + __func__, segments); + return segments; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 0a16fb6..78e0b8b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -60,6 +60,7 @@ * Interface Adapter -- one per transport instance */ struct rpcrdma_ia { + const struct rpcrdma_memreg_ops *ri_ops; rwlock_t ri_qplock; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; @@ -105,6 +106,10 @@ struct rpcrdma_ep { #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) +/* Force completion handler to ignore the signal + */ +#define RPCRDMA_IGNORE_COMPLETION (0ULL) + /* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV * * The below structure appears at the front of a large region of kmalloc'd @@ -143,14 +148,6 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) return (struct rpcrdma_msg *)rb->rg_base; } -enum rpcrdma_chunktype { - rpcrdma_noch = 0, - rpcrdma_readch, - rpcrdma_areadch, - rpcrdma_writech, - rpcrdma_replych -}; - /* * struct rpcrdma_rep -- this structure encapsulates state required to recv * and complete a reply, asychronously. 
It needs several pieces of @@ -213,6 +210,7 @@ struct rpcrdma_mw { struct ib_fmr *fmr; struct rpcrdma_frmr frmr; } r; + void (*mw_sendcompletion)(struct ib_wc *); struct list_head mw_list; struct list_head mw_all; }; @@ -258,7 +256,6 @@ struct rpcrdma_req { unsigned int rl_niovs; /* 0, 2 or 4 */ unsigned int rl_nchunks; /* non-zero if chunks */ unsigned int rl_connect_cookie; /* retry detection */ - enum rpcrdma_chunktype rl_rtype, rl_wtype; struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct ib_sge rl_send_iov[4]; /* for active requests */ @@ -340,6 +337,29 @@ struct rpcrdma_stats { }; /* + * Per-registration mode operations + */ +struct rpcrdma_xprt; +struct rpcrdma_memreg_ops { + int (*ro_map)(struct rpcrdma_xprt *, + struct rpcrdma_mr_seg *, int, bool); + int (*ro_unmap)(struct rpcrdma_xprt *, + struct rpcrdma_mr_seg *); + int (*ro_open)(struct rpcrdma_ia *, + struct rpcrdma_ep *, + struct rpcrdma_create_data_internal *); + size_t (*ro_maxpages)(struct rpcrdma_xprt *); + int (*ro_init)(struct rpcrdma_xprt *); + void (*ro_reset)(struct rpcrdma_xprt *); + void (*ro_destroy)(struct rpcrdma_buffer *); + const char *ro_displayname; +}; + +extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; +extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; +extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops; + +/* * RPCRDMA transport -- encapsulates the structures above for * integration with RPC. * @@ -398,16 +418,56 @@ void rpcrdma_buffer_put(struct rpcrdma_req *); void rpcrdma_recv_buffer_get(struct rpcrdma_req *); void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); -int rpcrdma_register_external(struct rpcrdma_mr_seg *, - int, int, struct rpcrdma_xprt *); -int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, - struct rpcrdma_xprt *); - struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, size_t, gfp_t); void rpcrdma_free_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); +unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *); + +/* + * Wrappers for chunk registration, shared by read/write chunk code. + */ + +void rpcrdma_mapping_error(struct rpcrdma_mr_seg *); + +static inline enum dma_data_direction +rpcrdma_data_dir(bool writing) +{ + return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; +} + +static inline void +rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg, + enum dma_data_direction direction) +{ + seg->mr_dir = direction; + seg->mr_dmalen = seg->mr_len; + + if (seg->mr_page) + seg->mr_dma = ib_dma_map_page(device, + seg->mr_page, offset_in_page(seg->mr_offset), + seg->mr_dmalen, seg->mr_dir); + else + seg->mr_dma = ib_dma_map_single(device, + seg->mr_offset, + seg->mr_dmalen, seg->mr_dir); + + if (ib_dma_mapping_error(device, seg->mr_dma)) + rpcrdma_mapping_error(seg); +} + +static inline void +rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg) +{ + if (seg->mr_page) + ib_dma_unmap_page(device, + seg->mr_dma, seg->mr_dmalen, seg->mr_dir); + else + ib_dma_unmap_single(device, + seg->mr_dma, seg->mr_dmalen, seg->mr_dir); +} + /* * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c */ @@ -418,9 +478,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *); /* * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c */ -ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t); int rpcrdma_marshal_req(struct rpc_rqst *); -size_t rpcrdma_max_payload(struct rpcrdma_xprt *); /* Temporary NFS request map cache. 
Created in svc_rdma.c */ extern struct kmem_cache *svc_rdma_map_cachep;
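A closing note on the new inline helpers in xprt_rdma.h: the writing flag handed to ro_map tracks the chunk type, not the NFS operation. A write or reply chunk is RDMA-written by the server, so the client buffer is a DMA destination; a read chunk is RDMA-read, so it is a DMA source. A tiny model of the rpcrdma_data_dir() rule, with enum values local to this sketch:

#include <stdio.h>
#include <stdbool.h>

enum dma_data_direction { DMA_TO_DEVICE, DMA_FROM_DEVICE };

/* Model of rpcrdma_data_dir(): "writing" means the remote peer writes
 * into this memory, so the device delivers data *from* the wire. */
static enum dma_data_direction data_dir(bool writing)
{
	return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
}

int main(void)
{
	printf("read chunk  -> %s\n",
	       data_dir(false) == DMA_TO_DEVICE ? "DMA_TO_DEVICE" : "?");
	printf("write chunk -> %s\n",
	       data_dir(true) == DMA_FROM_DEVICE ? "DMA_FROM_DEVICE" : "?");
	return 0;
}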